mirror of https://mirror.osredm.com/root/redis.git
72 lines
3.2 KiB
Python
72 lines
3.2 KiB
Python
from test import TestCase, fill_redis_with_vectors, generate_random_vector
|
|
|
|
class Reduce(TestCase):
    """Check that similarity search still works on dimension-reduced vectors.

    The key is populated with vectors that are reduced from a higher
    dimension on insertion; the test then compares the VSIM results
    against a linear scan over the original, non-reduced vectors.
    """

    def getname(self):
        """Return the human-readable name of this test."""
        return "Dimension Reduction"

    def estimated_runtime(self):
        """Return the estimated runtime of this test."""
        return 0.2

    def test(self):
        """Run the dimension-reduction accuracy check.

        Steps:
          1. Fill the test key with ``count`` vectors of ``original_dim``
             components, reduced to ``reduced_dim``.
          2. Verify that VDIM reports the reduced dimension.
          3. Query the top ``k`` neighbours of a random vector with VSIM
             and compare them with the top ``k`` from a linear scan.

        Raises:
            AssertionError: if the reported dimension is wrong, if the
                overlap between the two top-k sets is below the minimum,
                or if a common item's scores differ by 0.2 or more.
        """
        original_dim = 100
        reduced_dim = 80
        count = 1000
        k = 50  # Number of nearest neighbors to check

        # Fill Redis with vectors using REDUCE and get reference data
        data = fill_redis_with_vectors(
            self.redis, self.test_key, count, original_dim, reduced_dim)

        # Verify dimension is reduced
        dim = self.redis.execute_command('VDIM', self.test_key)
        assert dim == reduced_dim, f"Expected dimension {reduced_dim}, got {dim}"

        # Generate query vector and get nearest neighbors using Redis
        query_vec = generate_random_vector(original_dim)
        redis_raw = self.redis.execute_command(
            'VSIM', self.test_key, 'VALUES',
            original_dim, *[str(x) for x in query_vec],
            'COUNT', k, 'WITHSCORES')

        # Convert Redis results to dict. The reply is consumed as a flat
        # sequence of alternating element name / score entries.
        redis_results = {
            name.decode(): float(score)
            for name, score in zip(redis_raw[::2], redis_raw[1::2])
        }

        # Get results from linear scan with original vectors
        linear_results = data.find_k_nearest(query_vec, k)
        linear_items = dict(linear_results)

        # Compare overlap between reduced and non-reduced results
        common_items = set(redis_results) & set(linear_items)
        overlap_ratio = len(common_items) / k

        # With random projection, we expect some loss of accuracy but should
        # maintain at least some similarity structure.
        # Note that a gaussian distribution is the worst case for this test,
        # so in real world practice, things will be better.
        min_expected_overlap = 0.1  # At least 10% overlap in top-k

        # Print the comparison BEFORE asserting: the assertion below raises
        # on this very condition, so diagnostics placed after it would
        # never be reached.
        if overlap_ratio < min_expected_overlap:
            print("\nLow overlap in results. Details:")
            print("\nTop results from linear scan (original vectors):")
            for name, score in linear_results:
                print(f"{name}: {score:.3f}")
            print("\nTop results from Redis (reduced vectors):")
            for item, score in sorted(redis_results.items(),
                                      key=lambda x: x[1], reverse=True):
                print(f"{item}: {score:.3f}")

        assert overlap_ratio >= min_expected_overlap, \
            f"Dimension reduction lost too much structure. Only {overlap_ratio*100:.1f}% overlap in top {k}"

        # For items that appear in both results, scores should be reasonably correlated
        for item in common_items:
            redis_score = redis_results[item]
            linear_score = linear_items[item]
            # Allow for some deviation due to dimensionality reduction
            assert abs(redis_score - linear_score) < 0.2, \
                f"Score mismatch too high for {item}: Redis={redis_score:.3f} Linear={linear_score:.3f}"
|