redis/tests/reduce.py

72 lines
3.2 KiB
Python

from test import TestCase, fill_redis_with_vectors, generate_random_vector
class Reduce(TestCase):
    """Check that a dimension-reduced vector set keeps its neighbor structure.

    The test fills a key with vectors of ``original_dim`` components while
    asking for ``reduced_dim`` stored components, then compares the top-k
    answer of ``VSIM`` against the top-k answer computed from the reference
    data returned by ``fill_redis_with_vectors``.
    """

    def getname(self):
        """Return the display name of this test."""
        return "Dimension Reduction"

    def estimated_runtime(self):
        """Return the estimated runtime of this test (presumably in seconds)."""
        return 0.2

    def test(self):
        """Run the dimension-reduction check.

        Raises:
            AssertionError: if the reported dimension is not ``reduced_dim``,
                if the top-k overlap between Redis and the reference scan is
                below 10%, or if a shared item's two scores differ by 0.2
                or more.
        """
        original_dim = 100
        reduced_dim = 80
        count = 1000
        k = 50  # Number of nearest neighbors to check

        # Fill Redis with vectors using REDUCE and get reference data
        data = fill_redis_with_vectors(
            self.redis, self.test_key, count, original_dim, reduced_dim
        )

        # Verify dimension is reduced
        dim = self.redis.execute_command('VDIM', self.test_key)
        assert dim == reduced_dim, f"Expected dimension {reduced_dim}, got {dim}"

        # Generate query vector and get nearest neighbors using Redis
        query_vec = generate_random_vector(original_dim)
        redis_raw = self.redis.execute_command(
            'VSIM', self.test_key, 'VALUES',
            original_dim, *[str(x) for x in query_vec],
            'COUNT', k, 'WITHSCORES',
        )

        # The reply is consumed as a flat sequence alternating element name
        # (bytes) and score: [name0, score0, name1, score1, ...].
        redis_results = {
            redis_raw[i].decode(): float(redis_raw[i + 1])
            for i in range(0, len(redis_raw), 2)
        }

        # Get results from linear scan with original vectors
        linear_results = data.find_k_nearest(query_vec, k)
        linear_items = dict(linear_results)

        # Compare overlap between reduced and non-reduced results
        common_items = set(redis_results) & set(linear_items)
        overlap_ratio = len(common_items) / k

        # With random projection, we expect some loss of accuracy but should
        # maintain at least some similarity structure.
        # Note that gaussian distribution is the worst with this test, so
        # in real world practice, things will be better.
        min_expected_overlap = 0.1  # At least 10% overlap in top-k

        # Print the comparison BEFORE asserting: once the assertion below
        # raises, nothing after it runs, so a dump placed after the
        # assertion could never be shown for the failure it describes.
        if overlap_ratio < min_expected_overlap:
            print("\nLow overlap in results. Details:")
            print("\nTop results from linear scan (original vectors):")
            for name, score in linear_results:
                print(f"{name}: {score:.3f}")
            print("\nTop results from Redis (reduced vectors):")
            for item, score in sorted(redis_results.items(), key=lambda x: x[1], reverse=True):
                print(f"{item}: {score:.3f}")

        assert overlap_ratio >= min_expected_overlap, \
            f"Dimension reduction lost too much structure. Only {overlap_ratio*100:.1f}% overlap in top {k}"

        # For items that appear in both results, scores should be reasonably correlated
        for item in common_items:
            redis_score = redis_results[item]
            linear_score = linear_items[item]
            # Allow for some deviation due to dimensionality reduction
            assert abs(redis_score - linear_score) < 0.2, \
                f"Score mismatch too high for {item}: Redis={redis_score:.3f} Linear={linear_score:.3f}"