# redis/tests/reduce.py

from test import TestCase, fill_redis_with_vectors, generate_random_vector

class Reduce(TestCase):
    """Check that a vector set created with dimension reduction still
    returns neighbors resembling those of a linear scan over the
    original (non-reduced) vectors.
    """

    def getname(self):
        """Return the human-readable name of this test."""
        return "Dimension Reduction"

    def estimated_runtime(self):
        """Return the runtime estimate for this test.

        NOTE(review): the unit is defined by the test runner, which is
        not visible here -- presumably seconds; confirm.
        """
        return 0.2

    @staticmethod
    def _print_comparison(linear_results, redis_results):
        """Dump both result lists so a low-overlap failure can be debugged.

        linear_results is an iterable of (name, score) pairs; redis_results
        is a dict mapping name to score and is printed by descending score.
        """
        print("\nLow overlap in results. Details:")
        print("\nTop results from linear scan (original vectors):")
        for name, score in linear_results:
            print(f"{name}: {score:.3f}")
        print("\nTop results from Redis (reduced vectors):")
        for item, score in sorted(redis_results.items(), key=lambda x: x[1], reverse=True):
            print(f"{item}: {score:.3f}")

    def test(self):
        """Run the dimension reduction check.

        Fills the test key with vectors using a reduced dimension, verifies
        the dimension reported by VDIM, then compares the top-k VSIM results
        against a linear scan on the original vectors: the two result sets
        must overlap by a minimum ratio, and the scores of the items found
        by both must not differ by 0.2 or more.

        Raises:
            AssertionError: if any of the checks above fails.
        """
        original_dim = 100
        reduced_dim = 80
        count = 1000
        k = 50  # Number of nearest neighbors to check

        # Fill Redis with vectors using REDUCE and get reference data
        data = fill_redis_with_vectors(self.redis, self.test_key, count, original_dim, reduced_dim)

        # Verify dimension is reduced
        dim = self.redis.execute_command('VDIM', self.test_key)
        assert dim == reduced_dim, f"Expected dimension {reduced_dim}, got {dim}"

        # Generate query vector and get nearest neighbors using Redis
        query_vec = generate_random_vector(original_dim)
        redis_raw = self.redis.execute_command('VSIM', self.test_key, 'VALUES',
                                             original_dim, *[str(x) for x in query_vec],
                                             'COUNT', k, 'WITHSCORES')

        # The reply is consumed as a flat sequence alternating element name
        # and score: pair them up and convert to a name -> score dict.
        redis_results = {
            raw_name.decode(): float(raw_score)
            for raw_name, raw_score in zip(redis_raw[::2], redis_raw[1::2])
        }

        # Get results from linear scan with original vectors
        linear_results = data.find_k_nearest(query_vec, k)
        linear_items = {name: score for name, score in linear_results}

        # Compare overlap between reduced and non-reduced results
        common_items = set(redis_results) & set(linear_items)
        overlap_ratio = len(common_items) / k

        # With random projection, we expect some loss of accuracy but should
        # maintain at least some similarity structure.
        # Note that gaussian distribution is the worst with this test, so
        # in real world practice, things will be better.
        min_expected_overlap = 0.1  # At least 10% overlap in top-k

        # Print the comparison BEFORE asserting: once the assertion raises,
        # nothing after it runs, so the details would never be shown.
        if overlap_ratio < min_expected_overlap:
            self._print_comparison(linear_results, redis_results)

        assert overlap_ratio >= min_expected_overlap, \
            f"Dimension reduction lost too much structure. Only {overlap_ratio*100:.1f}% overlap in top {k}"

        # For items that appear in both results, scores should be reasonably correlated
        for item in common_items:
            redis_score = redis_results[item]
            linear_score = linear_items[item]
            # Allow for some deviation due to dimensionality reduction
            assert abs(redis_score - linear_score) < 0.2, \
                f"Score mismatch too high for {item}: Redis={redis_score:.3f} Linear={linear_score:.3f}"