mirror of https://mirror.osredm.com/root/redis.git
HNSW: creation time M parameter VS hardcoded.
This commit is contained in:
parent
0258e85186
commit
f829d46535
49
hnsw.c
49
hnsw.c
|
@ -54,12 +54,6 @@
|
||||||
|
|
||||||
/* Algorithm parameters. */
|
/* Algorithm parameters. */
|
||||||
|
|
||||||
#define HNSW_M 16 /* Number of max connections per node. Note that
|
|
||||||
* layer zero has twice as many. Also note that
|
|
||||||
* when a new node is added, we will populate
|
|
||||||
* even layer 0 links to just HNSW_M neighbors, so
|
|
||||||
* initially half layer 0 slots will be empty. */
|
|
||||||
#define HNSW_M0 (HNSW_M*2) /* Maximum number of connections for layer 0 */
|
|
||||||
#define HNSW_P 0.25 /* Probability of level increase. */
|
#define HNSW_P 0.25 /* Probability of level increase. */
|
||||||
#define HNSW_MAX_LEVEL 16 /* Max level nodes can reach. */
|
#define HNSW_MAX_LEVEL 16 /* Max level nodes can reach. */
|
||||||
#define HNSW_EF_C 200 /* Default size of dynamic candidate list while
|
#define HNSW_EF_C 200 /* Default size of dynamic candidate list while
|
||||||
|
@ -68,7 +62,8 @@
|
||||||
* used when deleting nodes for the search step
|
* used when deleting nodes for the search step
|
||||||
* needed sometimes to reconnect nodes that remain
|
* needed sometimes to reconnect nodes that remain
|
||||||
* orphaned of one link. */
|
* orphaned of one link. */
|
||||||
|
#define HNSW_DEFAULT_M 16 /* Useful if 0 is given at creation time. */
|
||||||
|
#define HNSW_MAX_M 1024 /* Hard limit for M. */
|
||||||
|
|
||||||
static void (*hfree)(void *p) = free;
|
static void (*hfree)(void *p) = free;
|
||||||
static void *(*hmalloc)(size_t s) = malloc;
|
static void *(*hmalloc)(size_t s) = malloc;
|
||||||
|
@ -391,10 +386,15 @@ uint32_t random_level() {
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Create new HNSW index, quantized or not. */
|
/* Create new HNSW index, quantized or not. */
|
||||||
HNSW *hnsw_new(uint32_t vector_dim, uint32_t quant_type) {
|
HNSW *hnsw_new(uint32_t vector_dim, uint32_t quant_type, uint32_t m) {
|
||||||
HNSW *index = hmalloc(sizeof(HNSW));
|
HNSW *index = hmalloc(sizeof(HNSW));
|
||||||
if (!index) return NULL;
|
if (!index) return NULL;
|
||||||
|
|
||||||
|
/* M parameter sanity check. */
|
||||||
|
if (m == 0) m = HNSW_DEFAULT_M;
|
||||||
|
else if (m > HNSW_MAX_M) m = HNSW_MAX_M;
|
||||||
|
|
||||||
|
index->M = m;
|
||||||
index->quant_type = quant_type;
|
index->quant_type = quant_type;
|
||||||
index->enter_point = NULL;
|
index->enter_point = NULL;
|
||||||
index->max_level = 0;
|
index->max_level = 0;
|
||||||
|
@ -556,7 +556,7 @@ hnswNode *hnsw_node_new(HNSW *index, uint64_t id, const float *vector, const int
|
||||||
|
|
||||||
/* Initialize each layer. */
|
/* Initialize each layer. */
|
||||||
for (uint32_t i = 0; i <= level; i++) {
|
for (uint32_t i = 0; i <= level; i++) {
|
||||||
uint32_t max_links = (i == 0) ? HNSW_M0 : HNSW_M;
|
uint32_t max_links = (i == 0) ? index->M*2 : index->M;
|
||||||
node->layers[i].max_links = max_links;
|
node->layers[i].max_links = max_links;
|
||||||
node->layers[i].num_links = 0;
|
node->layers[i].num_links = 0;
|
||||||
node->layers[i].worst_distance = 0;
|
node->layers[i].worst_distance = 0;
|
||||||
|
@ -939,7 +939,7 @@ void hnsw_update_worst_neighbor_on_remove(HNSW *index, hnswNode *node, uint32_t
|
||||||
void select_neighbors(HNSW *index, pqueue *candidates, hnswNode *new_node,
|
void select_neighbors(HNSW *index, pqueue *candidates, hnswNode *new_node,
|
||||||
uint32_t layer, uint32_t required_links, int aggressive)
|
uint32_t layer, uint32_t required_links, int aggressive)
|
||||||
{
|
{
|
||||||
uint32_t max_links = (layer == 0) ? HNSW_M0 : HNSW_M;
|
uint32_t max_links = (layer == 0) ? index->M*2 : index->M;
|
||||||
|
|
||||||
for (uint32_t i = 0; i < candidates->count; i++) {
|
for (uint32_t i = 0; i < candidates->count; i++) {
|
||||||
hnswNode *neighbor = pq_get_node(candidates,i);
|
hnswNode *neighbor = pq_get_node(candidates,i);
|
||||||
|
@ -1031,13 +1031,13 @@ void select_neighbors(HNSW *index, pqueue *candidates, hnswNode *new_node,
|
||||||
* if we remove the candidate node as its link. Let's check if
|
* if we remove the candidate node as its link. Let's check if
|
||||||
* this is the case: */
|
* this is the case: */
|
||||||
if (aggressive == 0 &&
|
if (aggressive == 0 &&
|
||||||
worst_node->layers[layer].num_links <= HNSW_M/2)
|
worst_node->layers[layer].num_links <= index->M/2)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* Aggressive level = 1. It's ok if the node remains with just
|
/* Aggressive level = 1. It's ok if the node remains with just
|
||||||
* HNSW_M/4 links. */
|
* HNSW_M/4 links. */
|
||||||
else if (aggressive == 1 &&
|
else if (aggressive == 1 &&
|
||||||
worst_node->layers[layer].num_links <= HNSW_M/4)
|
worst_node->layers[layer].num_links <= index->M/4)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* If aggressive is set to 2, then the new node we are adding failed
|
/* If aggressive is set to 2, then the new node we are adding failed
|
||||||
|
@ -1045,7 +1045,8 @@ void select_neighbors(HNSW *index, pqueue *candidates, hnswNode *new_node,
|
||||||
* node, so let's see if the target node has some other link
|
* node, so let's see if the target node has some other link
|
||||||
* that is well connected in the graph: we could drop it instead
|
* that is well connected in the graph: we could drop it instead
|
||||||
* of the worst link. */
|
* of the worst link. */
|
||||||
if (aggressive == 2 && worst_node->layers[layer].num_links <= HNSW_M/4)
|
if (aggressive == 2 && worst_node->layers[layer].num_links <=
|
||||||
|
index->M/4)
|
||||||
{
|
{
|
||||||
/* Let's see if we can find at least a candidate link that
|
/* Let's see if we can find at least a candidate link that
|
||||||
* would remain with a few connections. Track the one
|
* would remain with a few connections. Track the one
|
||||||
|
@ -1059,13 +1060,13 @@ void select_neighbors(HNSW *index, pqueue *candidates, hnswNode *new_node,
|
||||||
|
|
||||||
/* Skip this if it would remain too disconnected as well.
|
/* Skip this if it would remain too disconnected as well.
|
||||||
*
|
*
|
||||||
* NOTE about HNSW_M/4 min connections requirement:
|
* NOTE about index->M/4 min connections requirement:
|
||||||
*
|
*
|
||||||
* It is not too strict, since leaving a node with just a
|
* It is not too strict, since leaving a node with just a
|
||||||
* single link does not just leave it too weakly connected, but
|
* single link does not just leave it too weakly connected, but
|
||||||
* also sometimes creates cycles with few disconnected
|
* also sometimes creates cycles with few disconnected
|
||||||
* nodes linked among them. */
|
* nodes linked among them. */
|
||||||
if (to_drop->layers[layer].num_links <= HNSW_M/4) continue;
|
if (to_drop->layers[layer].num_links <= index->M/4) continue;
|
||||||
|
|
||||||
float link_dist = hnsw_distance(index, neighbor, to_drop);
|
float link_dist = hnsw_distance(index, neighbor, to_drop);
|
||||||
if (worst_node == NULL || link_dist > max_dist) {
|
if (worst_node == NULL || link_dist > max_dist) {
|
||||||
|
@ -1401,8 +1402,10 @@ void hnsw_reconnect_nodes(HNSW *index, hnswNode **nodes, int count, uint32_t lay
|
||||||
/* Try to connect with aggressiveness proportional to the
|
/* Try to connect with aggressiveness proportional to the
|
||||||
* node linking condition. */
|
* node linking condition. */
|
||||||
int aggressiveness =
|
int aggressiveness =
|
||||||
nodes[i]->layers[layer].num_links > HNSW_M / 2 ? 1 : 2;
|
(nodes[i]->layers[layer].num_links > index->M / 2)
|
||||||
select_neighbors(index, candidates, nodes[i], layer, wanted_links, aggressiveness);
|
? 1 : 2;
|
||||||
|
select_neighbors(index, candidates, nodes[i], layer,
|
||||||
|
wanted_links, aggressiveness);
|
||||||
debugmsg("Final links with broader search: %d (wanted: %d)\n", (int)nodes[i]->layers[layer].num_links, wanted_links);
|
debugmsg("Final links with broader search: %d (wanted: %d)\n", (int)nodes[i]->layers[layer].num_links, wanted_links);
|
||||||
pq_free(candidates);
|
pq_free(candidates);
|
||||||
}
|
}
|
||||||
|
@ -1729,25 +1732,25 @@ hnswNode *hnsw_commit_insert_nolock(HNSW *index, InsertContext *ctx) {
|
||||||
for (int lc = MIN(node->level,index->max_level); lc >= 0; lc--) {
|
for (int lc = MIN(node->level,index->max_level); lc >= 0; lc--) {
|
||||||
if (ctx->level_queues[lc] == NULL) continue;
|
if (ctx->level_queues[lc] == NULL) continue;
|
||||||
|
|
||||||
/* Try to provide HNSW_M connections to our node. The call
|
/* Try to provide index->M connections to our node. The call
|
||||||
* is not guaranteed to be able to provide all the links we would
|
* is not guaranteed to be able to provide all the links we would
|
||||||
* like to have for the new node: they must be bi-directional, obey
|
* like to have for the new node: they must be bi-directional, obey
|
||||||
* certain quality checks, and so forth, so later there are further
|
* certain quality checks, and so forth, so later there are further
|
||||||
* calls to force the hand a bit if needed.
|
* calls to force the hand a bit if needed.
|
||||||
*
|
*
|
||||||
* Let's start with aggressiveness = 0. */
|
* Let's start with aggressiveness = 0. */
|
||||||
select_neighbors(index, ctx->level_queues[lc], node, lc, HNSW_M, 0);
|
select_neighbors(index, ctx->level_queues[lc], node, lc, index->M, 0);
|
||||||
|
|
||||||
/* Layer 0 and too few connections? Let's be more aggressive. */
|
/* Layer 0 and too few connections? Let's be more aggressive. */
|
||||||
if (lc == 0 && node->layers[0].num_links < HNSW_M/2) {
|
if (lc == 0 && node->layers[0].num_links < index->M/2) {
|
||||||
select_neighbors(index, ctx->level_queues[lc], node, lc,
|
select_neighbors(index, ctx->level_queues[lc], node, lc,
|
||||||
HNSW_M, 1);
|
index->M, 1);
|
||||||
|
|
||||||
/* Still too few connections? Let's go to
|
/* Still too few connections? Let's go to
|
||||||
* aggressiveness level '2' in linking strategy. */
|
* aggressiveness level '2' in linking strategy. */
|
||||||
if (node->layers[0].num_links < HNSW_M/4) {
|
if (node->layers[0].num_links < index->M/4) {
|
||||||
select_neighbors(index, ctx->level_queues[lc], node, lc,
|
select_neighbors(index, ctx->level_queues[lc], node, lc,
|
||||||
HNSW_M/4, 2);
|
index->M/4, 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
8
hnsw.h
8
hnsw.h
|
@ -27,7 +27,8 @@ typedef struct {
|
||||||
* reallocate the node in very particular
|
* reallocate the node in very particular
|
||||||
* conditions in order to allow linking of
|
* conditions in order to allow linking of
|
||||||
* new inserted nodes, so this may change
|
* new inserted nodes, so this may change
|
||||||
* dynamically for a small set of nodes. */
|
* dynamically and be > M*2 for a small set of
|
||||||
|
* nodes. */
|
||||||
float worst_distance; /* Distance to the worst neighbor */
|
float worst_distance; /* Distance to the worst neighbor */
|
||||||
uint32_t worst_idx; /* Index of the worst neighbor */
|
uint32_t worst_idx; /* Index of the worst neighbor */
|
||||||
} hnswNodeLayer;
|
} hnswNodeLayer;
|
||||||
|
@ -74,6 +75,9 @@ typedef struct hnswCursor {
|
||||||
/* Main HNSW index structure */
|
/* Main HNSW index structure */
|
||||||
typedef struct HNSW {
|
typedef struct HNSW {
|
||||||
hnswNode *enter_point; /* Entry point for the graph */
|
hnswNode *enter_point; /* Entry point for the graph */
|
||||||
|
uint32_t M; /* M as in the paper: layer 0 has M*2 max
|
||||||
|
neighbors (M populated at insertion time)
|
||||||
|
while all the other layers have M neighbors. */
|
||||||
uint32_t max_level; /* Current maximum level in the graph */
|
uint32_t max_level; /* Current maximum level in the graph */
|
||||||
uint32_t vector_dim; /* Dimensionality of stored vectors */
|
uint32_t vector_dim; /* Dimensionality of stored vectors */
|
||||||
uint64_t node_count; /* Total number of nodes */
|
uint64_t node_count; /* Total number of nodes */
|
||||||
|
@ -110,7 +114,7 @@ typedef struct hnswSerNode {
|
||||||
typedef struct InsertContext InsertContext;
|
typedef struct InsertContext InsertContext;
|
||||||
|
|
||||||
/* Core HNSW functions */
|
/* Core HNSW functions */
|
||||||
HNSW *hnsw_new(uint32_t vector_dim, uint32_t quant_type);
|
HNSW *hnsw_new(uint32_t vector_dim, uint32_t quant_type, uint32_t m);
|
||||||
void hnsw_free(HNSW *index,void(*free_value)(void*value));
|
void hnsw_free(HNSW *index,void(*free_value)(void*value));
|
||||||
void hnsw_node_free(hnswNode *node);
|
void hnsw_node_free(hnswNode *node);
|
||||||
void hnsw_print_stats(HNSW *index);
|
void hnsw_print_stats(HNSW *index);
|
||||||
|
|
2
vset.c
2
vset.c
|
@ -106,7 +106,7 @@ struct vsetObject *createVectorSetObject(unsigned int dim, uint32_t quant_type)
|
||||||
if (!o) return NULL;
|
if (!o) return NULL;
|
||||||
|
|
||||||
o->id = VectorSetTypeNextId++;
|
o->id = VectorSetTypeNextId++;
|
||||||
o->hnsw = hnsw_new(dim,quant_type);
|
o->hnsw = hnsw_new(dim,quant_type,0);
|
||||||
if (!o->hnsw) {
|
if (!o->hnsw) {
|
||||||
RedisModule_Free(o);
|
RedisModule_Free(o);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
4
w2v.c
4
w2v.c
|
@ -26,7 +26,7 @@ uint64_t ms_time(void) {
|
||||||
/* Example usage in main() */
|
/* Example usage in main() */
|
||||||
int w2v_single_thread(int quantization, uint64_t numele, int massdel, int recall) {
|
int w2v_single_thread(int quantization, uint64_t numele, int massdel, int recall) {
|
||||||
/* Create index */
|
/* Create index */
|
||||||
HNSW *index = hnsw_new(300, quantization);
|
HNSW *index = hnsw_new(300, quantization, 0);
|
||||||
float v[300];
|
float v[300];
|
||||||
uint16_t wlen;
|
uint16_t wlen;
|
||||||
|
|
||||||
|
@ -201,7 +201,7 @@ int w2v_multi_thread(int numthreads, int quantization, uint64_t numele) {
|
||||||
/* Create index */
|
/* Create index */
|
||||||
struct threadContext ctx;
|
struct threadContext ctx;
|
||||||
|
|
||||||
ctx.index = hnsw_new(300,quantization);
|
ctx.index = hnsw_new(300,quantization,0);
|
||||||
|
|
||||||
ctx.fp = fopen("word2vec.bin","rb");
|
ctx.fp = fopen("word2vec.bin","rb");
|
||||||
if (ctx.fp == NULL) {
|
if (ctx.fp == NULL) {
|
||||||
|
|
Loading…
Reference in New Issue