mirror of https://mirror.osredm.com/root/redis.git
Test: FILTER integration tests, work in progress.
This commit is contained in:
parent
1cb927aef6
commit
c8b6cbc6e1
|
@ -0,0 +1,555 @@
|
|||
from test import TestCase, generate_random_vector
|
||||
import struct
|
||||
import random
|
||||
import math
|
||||
import json
|
||||
import time
|
||||
|
||||
class VSIMFilterAdvanced(TestCase):
    """Comprehensive integration tests for the VSIM FILTER option.

    Builds a dataset of random vectors with JSON attributes, then compares
    filtered VSIM results against a brute-force linear scan to measure
    recall, selectivity and query time across a variety of filter
    expressions, FILTER-EF values, COUNT values and edge cases.
    """

    def getname(self):
        """Return the human-readable name of this test."""
        return "VSIM FILTER comprehensive functionality testing"

    def estimated_runtime(self):
        """Return the estimated runtime in seconds."""
        return 15  # This test might take up to 15 seconds for the large dataset

    def setup(self):
        super().setup()
        self.dim = 32          # Vector dimension
        self.count = 5000      # Number of vectors for large tests
        self.small_count = 50  # Number of vectors for small/quick tests

        # Attribute value pools used by create_attributes().
        self.categories = ["electronics", "furniture", "clothing", "books", "food"]
        self.cities = ["New York", "London", "Tokyo", "Paris", "Berlin", "Sydney", "Toronto", "Singapore"]
        self.price_ranges = [(10, 50), (50, 200), (200, 1000), (1000, 5000)]
        self.years = list(range(2000, 2025))

    def create_attributes(self, index):
        """Create a JSON attribute string for the vector with the given index.

        A small percentage of items intentionally get invalid JSON or a
        missing "price" field so that the FILTER implementation's handling
        of malformed / incomplete attributes is exercised too.
        """
        category = random.choice(self.categories)
        city = random.choice(self.cities)
        min_price, max_price = random.choice(self.price_ranges)
        price = round(random.uniform(min_price, max_price), 2)
        year = random.choice(self.years)
        in_stock = random.random() > 0.3  # 70% chance of being in stock
        rating = round(random.uniform(1, 5), 1)
        views = int(random.expovariate(1/1000))  # Exponential distribution for page views
        tags = random.sample(["popular", "sale", "new", "limited", "exclusive", "clearance"],
                             k=random.randint(0, 3))

        # Every 10th item has a specific property combination for testing.
        is_premium = (index % 10 == 0)

        attrs = {
            "id": index,
            "category": category,
            "location": city,
            "price": price,
            "year": year,
            "in_stock": in_stock,
            "rating": rating,
            "views": views,
            "tags": tags
        }

        if is_premium:
            attrs["is_premium"] = True
            attrs["special_features"] = ["premium", "warranty", "support"]

        # Add sub-categories for more complex filters.
        if category == "electronics":
            attrs["subcategory"] = random.choice(["phones", "computers", "cameras", "audio"])
        elif category == "furniture":
            attrs["subcategory"] = random.choice(["chairs", "tables", "sofas", "beds"])
        elif category == "clothing":
            attrs["subcategory"] = random.choice(["shirts", "pants", "dresses", "shoes"])

        # Add some intentionally missing fields for testing.
        if random.random() > 0.9:  # 10% chance of missing price
            del attrs["price"]

        # Some items have a promotion field.
        if random.random() > 0.7:  # 30% chance of having a promotion
            attrs["promotion"] = random.choice(["discount", "bundle", "gift"])

        # Create invalid JSON for a small percentage of vectors.
        if random.random() > 0.98:  # 2% chance of having invalid JSON
            return "{{invalid json}}"

        return json.dumps(attrs)

    def create_vectors_with_attributes(self, key, count):
        """Create `count` random vectors under `key` and attach attributes.

        Returns (vectors, names, attribute_map) where attribute_map maps
        each item name to its parsed attributes dict, or None when the
        attribute string was (intentionally) invalid JSON.
        """
        vectors = []
        names = []
        attribute_map = {}  # To store attributes for verification

        for i in range(count):
            vec = generate_random_vector(self.dim)
            vectors.append(vec)
            name = f"{key}:item:{i}"
            names.append(name)

            # Add the vector to Redis.
            vec_bytes = struct.pack(f'{self.dim}f', *vec)
            self.redis.execute_command('VADD', key, 'FP32', vec_bytes, name)

            # Create and attach the attributes.
            attrs = self.create_attributes(i)
            self.redis.execute_command('VSETATTR', key, name, attrs)

            # Store parsed attributes for later verification; invalid JSON
            # maps to None so matches_filter() can reject those items.
            try:
                attribute_map[name] = json.loads(attrs) if '{' in attrs else None
            except json.JSONDecodeError:
                attribute_map[name] = None

        return vectors, names, attribute_map

    def filter_linear_search(self, vectors, names, query_vector, filter_expr, attribute_map, k=10):
        """Brute-force filtered k-NN used as ground truth.

        Returns up to `k` (name, similarity) pairs sorted by decreasing
        similarity, where similarity uses the same 1 - (distance / 2)
        mapping of cosine distance that VSIM reports.
        """
        similarities = []
        query_norm = math.sqrt(sum(x*x for x in query_vector))

        if query_norm == 0:
            return []  # A zero query vector has no meaningful direction.

        for i, vec in enumerate(vectors):
            name = names[i]
            attributes = attribute_map.get(name)

            # Skip items that don't match the filter.
            if not self.matches_filter(attributes, filter_expr):
                continue

            vec_norm = math.sqrt(sum(x*x for x in vec))
            if vec_norm == 0:
                continue

            dot_product = sum(a*b for a, b in zip(query_vector, vec))
            cosine_sim = dot_product / (query_norm * vec_norm)
            distance = 1.0 - cosine_sim
            redis_similarity = 1.0 - (distance/2.0)
            similarities.append((name, redis_similarity))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:k]

    def matches_filter(self, attributes, filter_expr):
        """Simple filter matching for verification - handles basic expressions.

        This is a simplified implementation: instead of a full expression
        parser it recognizes the exact filter strings used by this test.
        Unknown expressions fall through to True, which makes the ground
        truth permissive (and the selectivity comparison weak) for them.
        """
        if attributes is None:
            return False  # No attributes or invalid JSON

        if filter_expr == '.category == "electronics"':
            return attributes.get('category') == 'electronics'
        elif filter_expr == '.price > 1000':
            return attributes.get('price', 0) > 1000
        elif filter_expr == '.in_stock':
            return attributes.get('in_stock', False)
        elif filter_expr == '.rating >= 4':
            return attributes.get('rating', 0) >= 4
        elif filter_expr == '.category == "electronics" and .price < 500':
            return (attributes.get('category') == 'electronics' and
                    attributes.get('price', float('inf')) < 500)
        elif filter_expr == '.is_premium':
            return attributes.get('is_premium', False)
        elif filter_expr == '.price > 100 and .price < 1000':
            price = attributes.get('price', 0)
            return price > 100 and price < 1000
        elif filter_expr == '.year > 2000':
            # Used by the selectivity section of test(); previously fell
            # through to the permissive default.
            return attributes.get('year', 0) > 2000
        elif filter_expr == '.category == "electronics" and .price > 200':
            # Used by the FILTER-EF section of test(); previously fell
            # through to the permissive default.
            return (attributes.get('category') == 'electronics' and
                    attributes.get('price', 0) > 200)

        # Default case - we can't parse this filter.
        return True

    def safe_decode(self, item):
        """Decode a bytes reply element to str; pass anything else through."""
        return item.decode() if isinstance(item, bytes) else item

    def calculate_recall(self, redis_results, linear_results, k=10):
        """Calculate recall (fraction of ground-truth top-k that Redis found)."""
        redis_set = set(self.safe_decode(item) for item in redis_results)
        linear_set = set(item[0] for item in linear_results[:k])

        if not linear_set:
            return 1.0  # If no linear results, consider it perfect recall

        intersection = redis_set.intersection(linear_set)
        return len(intersection) / len(linear_set)

    def test_recall_with_filter(self, filter_expr, ef=500, filter_ef=None):
        """Compare filtered VSIM against the linear-scan ground truth.

        Returns (recall, selectivity, query_time_seconds, result_count).
        Raises AssertionError when selectivity diverges from ground truth
        or recall is too low for generous search parameters.
        """
        query_vec = generate_random_vector(self.dim)

        # First, get ground truth using a linear scan.
        linear_results = self.filter_linear_search(
            self.vectors, self.names, query_vec, filter_expr, self.attribute_map, k=50)

        # Selectivity from ground truth (both sides are capped at 50 results).
        true_selectivity = len(linear_results) / len(self.names) if self.names else 0

        # Perform the Redis search with the filter.
        cmd_args = ['VSIM', self.test_key, 'VALUES', self.dim]
        cmd_args.extend([str(x) for x in query_vec])
        cmd_args.extend(['COUNT', 50, 'WITHSCORES', 'EF', ef, 'FILTER', filter_expr])
        # `is not None` (not truthiness) so an explicit FILTER-EF of 0 is sent.
        if filter_ef is not None:
            cmd_args.extend(['FILTER-EF', filter_ef])

        start_time = time.time()
        redis_results = self.redis.execute_command(*cmd_args)
        query_time = time.time() - start_time

        # Convert the flat [name, score, name, score, ...] reply into a dict.
        redis_items = {}
        for i in range(0, len(redis_results), 2):
            key = self.safe_decode(redis_results[i])
            score = float(redis_results[i+1])
            redis_items[key] = score

        # Calculate metrics.
        recall = self.calculate_recall(redis_items.keys(), linear_results)
        selectivity = len(redis_items) / len(self.names) if redis_items else 0

        # Compare against the true selectivity from the linear scan.
        assert abs(selectivity - true_selectivity) < 0.1, \
            f"Redis selectivity {selectivity:.3f} differs significantly from ground truth {true_selectivity:.3f}"

        # We expect high recall for generous search parameters.
        if ef >= 500 and (filter_ef is None or filter_ef >= 1000):
            assert recall >= 0.7, \
                f"Low recall {recall:.2f} for filter '{filter_expr}'"

        return recall, selectivity, query_time, len(redis_items)

    def test(self):
        print("\nRunning comprehensive VSIM FILTER tests...")

        # Create a larger dataset for testing.
        print(f"Creating dataset with {self.count} vectors and attributes...")
        self.vectors, self.names, self.attribute_map = self.create_vectors_with_attributes(
            self.test_key, self.count)

        # ==== 1. Recall and Precision Testing ====
        print("Testing recall for various filters...")

        # Test basic filters with different selectivity.
        results = {}
        results["category"] = self.test_recall_with_filter('.category == "electronics"')
        results["price_high"] = self.test_recall_with_filter('.price > 1000')
        results["in_stock"] = self.test_recall_with_filter('.in_stock')
        results["rating"] = self.test_recall_with_filter('.rating >= 4')
        results["complex1"] = self.test_recall_with_filter('.category == "electronics" and .price < 500')

        print("Filter | Recall | Selectivity | Time (ms) | Results")
        print("----------------------------------------------------")
        # query_time is in seconds; it is converted to ms only for display.
        for name, (recall, selectivity, query_time, count) in results.items():
            print(f"{name:7} | {recall:.3f} | {selectivity:.3f} | {query_time*1000:.1f} | {count}")

        # ==== 2. Filter Selectivity Performance ====
        print("\nTesting filter selectivity performance...")

        # High selectivity (very few matches).
        high_sel_recall, _, high_sel_time, _ = self.test_recall_with_filter('.is_premium')

        # Medium selectivity.
        med_sel_recall, _, med_sel_time, _ = self.test_recall_with_filter('.price > 100 and .price < 1000')

        # Low selectivity (many matches).
        low_sel_recall, _, low_sel_time, _ = self.test_recall_with_filter('.year > 2000')

        print(f"High selectivity recall: {high_sel_recall:.3f}, time: {high_sel_time*1000:.1f}ms")
        print(f"Med selectivity recall: {med_sel_recall:.3f}, time: {med_sel_time*1000:.1f}ms")
        print(f"Low selectivity recall: {low_sel_recall:.3f}, time: {low_sel_time*1000:.1f}ms")

        # ==== 3. FILTER-EF Parameter Testing ====
        print("\nTesting FILTER-EF parameter...")

        # Test with different FILTER-EF values.
        filter_expr = '.category == "electronics" and .price > 200'
        ef_values = [100, 500, 2000, 5000]

        print("FILTER-EF | Recall | Time (ms)")
        print("-----------------------------")
        for filter_ef in ef_values:
            recall, _, query_time, _ = self.test_recall_with_filter(
                filter_expr, ef=500, filter_ef=filter_ef)
            print(f"{filter_ef:9} | {recall:.3f} | {query_time*1000:.1f}")

        # Assert that higher FILTER-EF generally gives better recall.
        low_ef_recall, _, _, _ = self.test_recall_with_filter(filter_expr, filter_ef=100)
        high_ef_recall, _, _, _ = self.test_recall_with_filter(filter_expr, filter_ef=5000)

        # This might not always be true due to randomness, but generally holds.
        # We use a softer assertion to avoid flaky tests.
        assert high_ef_recall >= low_ef_recall * 0.8, \
            f"Higher FILTER-EF should generally give better recall: {high_ef_recall:.3f} vs {low_ef_recall:.3f}"

        # ==== 4. Complex Filter Expressions ====
        print("\nTesting complex filter expressions...")

        # Test a variety of complex expressions.
        complex_filters = [
            '.price > 100 and (.category == "electronics" or .category == "furniture")',
            '(.rating > 4 and .in_stock) or (.price < 50 and .views > 1000)',
            '.category in ["electronics", "clothing"] and .price > 200 and .rating >= 3',
            '(.category == "electronics" and .subcategory == "phones") or (.category == "furniture" and .price > 1000)',
            '.year > 2010 and !(.price < 100) and .in_stock'
        ]

        print("Expression | Results | Time (ms)")
        print("-----------------------------")
        for i, expr in enumerate(complex_filters):
            try:
                _, _, query_time, result_count = self.test_recall_with_filter(expr)
                print(f"Complex {i+1} | {result_count:7} | {query_time*1000:.1f}")
            except Exception as e:
                print(f"Complex {i+1} | Error: {str(e)}")

        # ==== 5. Attribute Type Testing ====
        print("\nTesting different attribute types...")

        type_filters = [
            ('.price > 500', "Numeric"),
            ('.category == "books"', "String equality"),
            ('.in_stock', "Boolean"),
            ('.tags in ["sale", "new"]', "Array membership"),
            ('.rating * 2 > 8', "Arithmetic")
        ]

        for expr, type_name in type_filters:
            try:
                _, _, query_time, result_count = self.test_recall_with_filter(expr)
                print(f"{type_name:16} | {expr:30} | {result_count:5} results | {query_time*1000:.1f}ms")
            except Exception as e:
                print(f"{type_name:16} | {expr:30} | Error: {str(e)}")

        # ==== 6. Filter + Count Interaction ====
        print("\nTesting COUNT parameter with filters...")

        filter_expr = '.category == "electronics"'
        counts = [5, 20, 100]

        for count in counts:
            query_vec = generate_random_vector(self.dim)
            cmd_args = ['VSIM', self.test_key, 'VALUES', self.dim]
            cmd_args.extend([str(x) for x in query_vec])
            cmd_args.extend(['COUNT', count, 'WITHSCORES', 'FILTER', filter_expr])

            results = self.redis.execute_command(*cmd_args)
            result_count = len(results) // 2  # Divide by 2 because WITHSCORES returns pairs

            # We expect the result count to be at most the requested count.
            assert result_count <= count, f"Got {result_count} results with COUNT {count}"
            print(f"COUNT {count:3} | Got {result_count:3} results")

        # ==== 7. Edge Cases ====
        print("\nTesting edge cases...")

        # Test with no matching items.
        no_match_expr = '.category == "nonexistent_category"'
        results = self.redis.execute_command('VSIM', self.test_key, 'VALUES', self.dim,
                                             *[str(x) for x in generate_random_vector(self.dim)],
                                             'FILTER', no_match_expr)
        assert len(results) == 0, f"Expected 0 results for non-matching filter, got {len(results)}"
        print(f"No matching items: {len(results)} results (expected 0)")

        # Test with invalid filter syntax. NOTE: a bare `except:` around the
        # whole block would also swallow the AssertionError, making the check
        # a no-op, so we record the outcome in a flag and assert afterwards.
        got_error = False
        try:
            self.redis.execute_command('VSIM', self.test_key, 'VALUES', self.dim,
                                       *[str(x) for x in generate_random_vector(self.dim)],
                                       'FILTER', '.category === "books"')  # Triple equals is invalid
        except Exception:
            got_error = True
        assert got_error, "Expected error for invalid filter syntax"
        print("Invalid filter syntax correctly raised an error")

        # Test with an extremely long complex expression.
        long_expr = ' and '.join([f'.rating > {i/10}' for i in range(10)])
        try:
            results = self.redis.execute_command('VSIM', self.test_key, 'VALUES', self.dim,
                                                 *[str(x) for x in generate_random_vector(self.dim)],
                                                 'FILTER', long_expr)
            print(f"Long expression: {len(results)} results")
        except Exception as e:
            print(f"Long expression error: {str(e)}")

        print("\nComprehensive VSIM FILTER tests completed successfully")
|
||||
|
||||
|
||||
class VSIMFilterSelectivityTest(TestCase):
    """Benchmark VSIM FILTER queries at controlled filter selectivities.

    Ages are assigned cyclically 1..100, so a filter `.age <= N` matches
    exactly N% of the dataset, letting us sweep selectivity precisely.
    """

    def getname(self):
        """Return the human-readable name of this test."""
        return "VSIM FILTER selectivity performance benchmark"

    def estimated_runtime(self):
        """Return the estimated runtime in seconds."""
        return 8  # This test might take up to 8 seconds

    def setup(self):
        super().setup()
        self.dim = 32
        self.count = 10000
        self.test_key = f"{self.test_key}:selectivity"  # Use a different key

    def create_vector_with_age_attribute(self, name, age):
        """Add a random vector under `name` with a single {"age": age} attribute."""
        vec = generate_random_vector(self.dim)
        vec_bytes = struct.pack(f'{self.dim}f', *vec)
        self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes, name)
        self.redis.execute_command('VSETATTR', self.test_key, name, json.dumps({"age": age}))

    def test(self):
        print("\nRunning VSIM FILTER selectivity benchmark...")

        # Create a dataset where we control the exact selectivity.
        print(f"Creating controlled dataset with {self.count} vectors...")

        # Create vectors with age attributes cycling from 1 to 100.
        for i in range(self.count):
            age = (i % 100) + 1  # Ages from 1 to 100
            name = f"{self.test_key}:item:{i}"
            self.create_vector_with_age_attribute(name, age)

        # Create a query vector.
        query_vec = generate_random_vector(self.dim)

        # Test filters with different selectivities.
        selectivities = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.99]

        print("\nSelectivity | Filter | Results | Time (ms)")
        print("--------------------------------------------------")

        for target_selectivity in selectivities:
            # Calculate the age threshold for the desired selectivity:
            # `.age <= 10` matches exactly 10% of items, and so on.
            age_threshold = int(target_selectivity * 100)
            filter_expr = f'.age <= {age_threshold}'

            # Run the query and measure its time. The reply gets its own
            # name (query_results): the original bound it to `results`,
            # shadowing a dead list accumulator that was never used.
            start_time = time.time()
            cmd_args = ['VSIM', self.test_key, 'VALUES', self.dim]
            cmd_args.extend([str(x) for x in query_vec])
            cmd_args.extend(['COUNT', 100, 'FILTER', filter_expr])

            query_results = self.redis.execute_command(*cmd_args)
            query_time = time.time() - start_time

            print(f"{target_selectivity:.2f} | {filter_expr:15} | {len(query_results):7} | {query_time*1000:.1f}")

            # Ensure reasonable behavior across selectivities. Very
            # selective queries (<= 5%) may need more exploration, so we
            # only require that they return something at all.
            if target_selectivity <= 0.05:
                assert len(query_results) > 0, f"No results found for {filter_expr}"
            else:
                # For less selective queries, performance should be reasonable.
                assert query_time < 1.0, f"Query too slow: {query_time:.3f}s for {filter_expr}"

        print("\nSelectivity benchmark completed successfully")
|
||||
|
||||
|
||||
class VSIMFilterComparisonTest(TestCase):
    """Compare VSIM recall/latency across different FILTER-EF values."""

    def getname(self):
        """Return the human-readable name of this test."""
        return "VSIM FILTER EF parameter comparison"

    def estimated_runtime(self):
        """Return the estimated runtime in seconds."""
        return 8  # This test might take up to 8 seconds

    def setup(self):
        super().setup()
        self.dim = 32
        self.count = 5000
        self.test_key = f"{self.test_key}:efparams"  # Use a different key

    def create_dataset(self):
        """Create a dataset with specific attribute patterns for testing FILTER-EF.

        Returns (vectors, names) for the created items.
        """
        vectors = []
        names = []

        # Create vectors with category and quality-score attributes.
        for i in range(self.count):
            vec = generate_random_vector(self.dim)
            name = f"{self.test_key}:item:{i}"

            # Add the vector to Redis.
            vec_bytes = struct.pack(f'{self.dim}f', *vec)
            self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes, name)

            # We want a very selective filter: ~10% of items are "premium"
            # and, independently, ~10% have quality > 90, so only ~1% of
            # items satisfy category == "premium" AND quality > 90.
            category = "premium" if random.random() < 0.1 else random.choice(["standard", "economy", "basic"])
            quality = random.randint(1, 100)

            attrs = {
                "id": i,
                "category": category,
                "quality": quality
            }

            self.redis.execute_command('VSETATTR', self.test_key, name, json.dumps(attrs))
            vectors.append(vec)
            names.append(name)

        return vectors, names

    def test(self):
        print("\nRunning VSIM FILTER-EF parameter comparison...")

        # Create the dataset.
        vectors, names = self.create_dataset()

        # A selective filter that matches ~1% of items.
        filter_expr = '.category == "premium" and .quality > 90'

        # Create the query vector.
        query_vec = generate_random_vector(self.dim)

        # Test different FILTER-EF values.
        ef_values = [50, 100, 500, 1000, 5000]
        results = []

        # Run all queries first so that the recall baseline can be the most
        # exhaustive run (the HIGHEST FILTER-EF). The original captured the
        # baseline on the first, lowest-EF run, so recall_rate was measured
        # against the wrong reference and "Low recall!" could never trigger.
        for ef in ef_values:
            start_time = time.time()
            cmd_args = ['VSIM', self.test_key, 'VALUES', self.dim]
            cmd_args.extend([str(x) for x in query_vec])
            cmd_args.extend(['COUNT', 100, 'FILTER', filter_expr, 'FILTER-EF', ef])

            query_results = self.redis.execute_command(*cmd_args)
            query_time = time.time() - start_time
            results.append((ef, len(query_results), query_time))

        # Baseline for recall comparison: result count at the highest EF.
        baseline_count = results[-1][1]

        print("\nFILTER-EF | Results | Time (ms) | Notes")
        print("---------------------------------------")

        for ef, result_count, query_time in results:
            recall_rate = result_count / baseline_count if baseline_count > 0 else 1.0

            if ef == ef_values[-1]:
                notes = "Baseline"
            elif recall_rate < 0.5:
                notes = "Low recall!"
            else:
                notes = ""

            print(f"{ef:9} | {result_count:7} | {query_time*1000:.1f} | {notes}")

        # If we have enough results at the highest EF, check that recall
        # improves (or at least does not degrade) with higher EF.
        if results[-1][1] >= 5:  # At least 5 results for highest EF
            result_counts = [r[1] for r in results]

            # The last result (highest EF) should typically find more results
            # than the first (lowest EF), but we use a soft assertion to
            # avoid flaky tests.
            assert result_counts[-1] >= result_counts[0], \
                f"Higher FILTER-EF should find at least as many results: {result_counts[-1]} vs {result_counts[0]}"

        print("\nFILTER-EF parameter comparison completed successfully")
|
Loading…
Reference in New Issue