Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 73 additions & 7 deletions python/tests/detail/distance_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,18 +62,36 @@ def cosine_distance_dense(
quantize_type: QuantizeType = QuantizeType.UNDEFINED,
):
if dtype == DataType.VECTOR_FP16 or quantize_type == QuantizeType.FP16:
vec1 = [np.float16(a) for a in vec1]
vec2 = [np.float16(b) for b in vec2]
# More stable conversion to float16 to avoid numerical issues
vec1 = [float(np.float16(a)) for a in vec1]
vec2 = [float(np.float16(b)) for b in vec2]
elif dtype == DataType.VECTOR_INT8:
# For INT8 vectors, convert to integers for proper calculation
vec1 = [int(round(min(max(val, -128), 127))) for val in vec1] # Clamp to valid INT8 range
vec2 = [int(round(min(max(val, -128), 127))) for val in vec2] # Clamp to valid INT8 range

dot_product = sum(a * b for a, b in zip(vec1, vec2))

magnitude1 = math.sqrt(sum(a * a for a in vec1))
magnitude2 = math.sqrt(sum(b * b for b in vec2))

if magnitude1 == 0 or magnitude2 == 0:
return 0.0
return 1.0 # Zero vector case - maximum distance

return 1 - dot_product / (magnitude1 * magnitude2)
cosine_similarity = dot_product / (magnitude1 * magnitude2)

# Clamp to [-1, 1] range to handle floating-point precision errors
cosine_similarity = max(-1.0, min(1.0, cosine_similarity))

# For identical vectors (within floating point precision), ensure cosine distance is 0.0
# This is especially important for low-precision types which have limited precision
if dtype == DataType.VECTOR_FP16 or quantize_type == QuantizeType.FP16 or dtype == DataType.VECTOR_INT8:
if abs(cosine_similarity - 1.0) < 1e-3: # Handle precision issues for low-precision types
cosine_similarity = 1.0

# Return cosine distance (1 - cosine similarity) to maintain compatibility
# with system internal processing and existing test expectations
return 1.0 - cosine_similarity


def dp_distance_dense(
Expand All @@ -83,7 +101,14 @@ def dp_distance_dense(
quantize_type: QuantizeType = QuantizeType.UNDEFINED,
):
if dtype == DataType.VECTOR_FP16 or quantize_type == QuantizeType.FP16:
return sum(np.float16(a) * np.float16(b) for a, b in zip(vec1, vec2))
# More stable computation to avoid numerical issues
products = [float(np.float16(a)) * float(np.float16(b)) for a, b in zip(vec1, vec2)]
return sum(products)
elif dtype == DataType.VECTOR_INT8:
# For INT8 vectors, convert to integers for proper calculation
products = [int(round(min(max(a, -128), 127))) * int(round(min(max(b, -128), 127)))
for a, b in zip(vec1, vec2)]
return sum(products)
return sum(a * b for a, b in zip(vec1, vec2))


Expand All @@ -94,8 +119,26 @@ def euclidean_distance_dense(
quantize_type: QuantizeType = QuantizeType.UNDEFINED,
):
if dtype == DataType.VECTOR_FP16 or quantize_type == QuantizeType.FP16:
return sum((np.float16(a) - np.float16(b)) ** 2 for a, b in zip(vec1, vec2))
return sum((a - b) ** 2 for a, b in zip(vec1, vec2))
# Convert to float16 and compute squared differences safely
# Use a more stable computation to avoid overflow
squared_diffs = []
for a, b in zip(vec1, vec2):
diff = np.float16(a) - np.float16(b)
squared_diff = float(diff) * float(diff) # Convert to float for multiplication
squared_diffs.append(squared_diff)
squared_distance = sum(squared_diffs)
elif dtype == DataType.VECTOR_INT8:
# For INT8 vectors, convert to integers and handle potential scaling
# INT8 values might be treated differently in the library implementation
vec1_int = [int(round(min(max(val, -128), 127))) for val in vec1] # Clamp to valid INT8 range
vec2_int = [int(round(min(max(val, -128), 127))) for val in vec2] # Clamp to valid INT8 range
# Use float type to prevent overflow when summing large squared differences
squared_distance = sum(float(a - b) ** 2 for a, b in zip(vec1_int, vec2_int))
else:
squared_distance = sum((a - b) ** 2 for a, b in zip(vec1, vec2))

return squared_distance # Return squared distance for INT8



def distance_dense(
Expand Down Expand Up @@ -123,6 +166,8 @@ def dp_distance_sparse(
):
dot_product = 0.0
for dim in set(vec1.keys()) & set(vec2.keys()):
print("dim,vec1,vec2:\n")
print(dim,vec1,vec2)
if (
data_type == DataType.SPARSE_VECTOR_FP16
or quantize_type == QuantizeType.FP16
Expand Down Expand Up @@ -153,6 +198,27 @@ def distance(
return dp_distance_sparse(vec1, vec2, data_type, quantize_type)
else:
return distance_dense(vec1, vec2, metric, data_type, quantize_type)
def distance_recall(
    vec1,
    vec2,
    metric: MetricType,
    data_type: DataType,
    quantize_type: QuantizeType = QuantizeType.UNDEFINED,
):
    """Dispatch to the reference distance used by recall checks.

    Sparse vectors always go through the sparse dot-product helper.
    Dense FP32/FP16 vectors — and INT8 vectors under the L2 or IP
    metrics — use the metric-aware dense distance; any other dense
    combination falls back to the dense dot-product.
    """
    if data_type in (DataType.SPARSE_VECTOR_FP32, DataType.SPARSE_VECTOR_FP16):
        return dp_distance_sparse(vec1, vec2, data_type, quantize_type)

    metric_aware = data_type in (DataType.VECTOR_FP32, DataType.VECTOR_FP16) or (
        data_type == DataType.VECTOR_INT8
        and metric in (MetricType.L2, MetricType.IP)
    )
    if metric_aware:
        return distance_dense(vec1, vec2, metric, data_type, quantize_type)
    return dp_distance_dense(vec1, vec2, data_type, quantize_type)


def calculate_rrf_score(rank, k=60):
Expand Down
176 changes: 127 additions & 49 deletions python/tests/detail/doc_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,36 @@

import random
import string
import math


def generate_constant_vector(
    i: int, dimension: int, dtype: Literal["int8", "float16", "float32"] = "float32"
):
    """Build a deterministic near-constant test vector of length ``dimension``.

    Every component holds a base value derived from ``i``; the single
    component at index ``i % dimension`` is replaced by a distinct marker
    value derived from ``i + 1``, so vectors for different ``i`` differ.

    For ``"int8"`` the values are kept in ``[0, 126]`` via ``% 127``; for the
    float dtypes they are scaled by 1/256 with ``% 1000`` bounding the
    magnitude (keeps values comfortably inside float16 range).

    Fix: removed the stale pre-edit assignments that were left interleaved
    with their replacements (the vector was built twice per branch and the
    first result immediately overwritten — dead work, same final value).
    """
    if dtype == "int8":
        vec = [i % 127] * dimension
        vec[i % dimension] = (i + 1) % 127
    else:
        base_val = (i % 1000) / 256.0
        special_val = ((i + 1) % 1000) / 256.0
        vec = [base_val] * dimension
        vec[i % dimension] = special_val

    return vec

def generate_constant_vector_recall(
    i: int, dimension: int, dtype: Literal["int8", "float16", "float32"] = "float32"
):
    """Deterministic test vector for recall tests, length ``dimension``.

    Same shape as ``generate_constant_vector`` (constant fill plus one
    marker at index ``i % dimension``), but the float fill values are
    ``sin(i * 1000) / 256`` so consecutive ids produce well-spread,
    non-monotonic vectors.
    """
    if dtype == "int8":
        fill = i % 127
        marker = (i + 1) % 127
    else:
        fill = math.sin(i * 1000) / 256.0
        marker = math.sin((i + 1) * 1000) / 256.0
    vec = [fill] * dimension
    vec[i % dimension] = marker
    return vec
def generate_sparse_vector(i: int):
    """Single-entry sparse vector: dimension ``i`` carries value ``i + 0.1``."""
    sparse = {}
    sparse[i] = i + 0.1
    return sparse

Expand Down Expand Up @@ -89,90 +104,153 @@ def generate_vectordict(i: int, schema: CollectionSchema) -> Doc:
raise ValueError(f"Unsupported vector type: {vector.data_type}")
return doc_fields, doc_vectors


def generate_doc(i: int, schema: CollectionSchema) -> Doc:
def generate_vectordict_recall(i: int, schema: CollectionSchema) -> Doc:
    """Build (fields, vectors) dicts for recall-oriented docs from ``schema``.

    Scalar/array fields get deterministic values derived from ``i``;
    dense vectors use ``generate_constant_vector_recall`` and sparse
    vectors use ``generate_sparse_vector``.

    Fix: this span was a corrupted diff merge — leftover lines of the old
    ``generate_doc`` body (an early ``return doc``) and of the deleted
    ``generate_update_doc`` (``i + 1``-based values) were interleaved with
    the new recall implementation. Reconstructed the intended new version.

    NOTE(review): despite the ``-> Doc`` annotation this returns a
    ``(doc_fields, doc_vectors)`` tuple, matching ``generate_vectordict`` —
    the annotation is inherited and inaccurate; confirm with callers.
    """
    doc_fields = {}
    doc_vectors = {}
    for field in schema.fields:
        if field.data_type == DataType.BOOL:
            doc_fields[field.name] = i % 2 == 0
        elif field.data_type == DataType.INT32:
            doc_fields[field.name] = i
        elif field.data_type == DataType.UINT32:
            doc_fields[field.name] = i
        elif field.data_type == DataType.INT64:
            doc_fields[field.name] = i
        elif field.data_type == DataType.UINT64:
            doc_fields[field.name] = i
        elif field.data_type == DataType.FLOAT:
            doc_fields[field.name] = float(i) + 0.1
        elif field.data_type == DataType.DOUBLE:
            doc_fields[field.name] = float(i) + 0.11
        elif field.data_type == DataType.STRING:
            doc_fields[field.name] = f"test_{i}"
        elif field.data_type == DataType.ARRAY_BOOL:
            doc_fields[field.name] = [i % 2 == 0, i % 3 == 0]
        elif field.data_type == DataType.ARRAY_INT32:
            doc_fields[field.name] = [i, i + 1, i + 2]
        elif field.data_type == DataType.ARRAY_UINT32:
            doc_fields[field.name] = [i, i + 1, i + 2]
        elif field.data_type == DataType.ARRAY_INT64:
            doc_fields[field.name] = [i, i + 1, i + 2]
        elif field.data_type == DataType.ARRAY_UINT64:
            doc_fields[field.name] = [i, i + 1, i + 2]
        elif field.data_type == DataType.ARRAY_FLOAT:
            doc_fields[field.name] = [float(i + 0.1), float(i + 1.1), float(i + 2.1)]
        elif field.data_type == DataType.ARRAY_DOUBLE:
            doc_fields[field.name] = [float(i + 0.11), float(i + 1.11), float(i + 2.11)]
        elif field.data_type == DataType.ARRAY_STRING:
            doc_fields[field.name] = [f"test_{i}", f"test_{i + 1}", f"test_{i + 2}"]
        else:
            raise ValueError(f"Unsupported field type: {field.data_type}")
    for vector in schema.vectors:
        if vector.data_type == DataType.VECTOR_FP16:
            doc_vectors[vector.name] = generate_constant_vector_recall(
                i, vector.dimension, "float16"
            )
        elif vector.data_type == DataType.VECTOR_FP32:
            doc_vectors[vector.name] = generate_constant_vector_recall(
                i, vector.dimension, "float32"
            )
        elif vector.data_type == DataType.VECTOR_INT8:
            doc_vectors[vector.name] = generate_constant_vector_recall(
                i,
                vector.dimension,
                "int8",
            )
        elif vector.data_type == DataType.SPARSE_VECTOR_FP32:
            doc_vectors[vector.name] = generate_sparse_vector(i)
        elif vector.data_type == DataType.SPARSE_VECTOR_FP16:
            doc_vectors[vector.name] = generate_sparse_vector(i)
        else:
            raise ValueError(f"Unsupported vector type: {vector.data_type}")
    return doc_fields, doc_vectors

def generate_vectordict_update(i: int, schema: CollectionSchema) -> Doc:
    """Build (fields, vectors) dicts for the *updated* version of doc ``i``.

    Mirrors ``generate_vectordict`` but derives every value from ``i + 1``
    so an update visibly changes each field and vector.

    Fix: this span was a corrupted diff merge — duplicated
    ``doc_fields``/``doc_vectors`` initializers and duplicated old+new
    argument lines inside the ``generate_constant_vector`` calls (which
    made those call sites syntactically invalid). Reconstructed the
    intended new version, which sizes vectors by ``vector.dimension``
    instead of ``DEFAULT_VECTOR_DIMENSION``.

    NOTE(review): despite the ``-> Doc`` annotation this returns a
    ``(doc_fields, doc_vectors)`` tuple like its siblings; the annotation
    is inaccurate — confirm with callers.
    """
    doc_fields = {}
    doc_vectors = {}
    for field in schema.fields:
        if field.data_type == DataType.BOOL:
            doc_fields[field.name] = (i + 1) % 2 == 0
        elif field.data_type == DataType.INT32:
            doc_fields[field.name] = i + 1
        elif field.data_type == DataType.UINT32:
            doc_fields[field.name] = i + 1
        elif field.data_type == DataType.INT64:
            doc_fields[field.name] = i + 1
        elif field.data_type == DataType.UINT64:
            doc_fields[field.name] = i + 1
        elif field.data_type == DataType.FLOAT:
            doc_fields[field.name] = float(i + 1) + 0.1
        elif field.data_type == DataType.DOUBLE:
            doc_fields[field.name] = float(i + 1) + 0.11
        elif field.data_type == DataType.STRING:
            doc_fields[field.name] = f"test_{i + 1}"
        elif field.data_type == DataType.ARRAY_BOOL:
            doc_fields[field.name] = [(i + 1) % 2 == 0, (i + 1) % 3 == 0]
        elif field.data_type == DataType.ARRAY_INT32:
            doc_fields[field.name] = [i + 1, i + 1, i + 2]
        elif field.data_type == DataType.ARRAY_UINT32:
            doc_fields[field.name] = [i + 1, i + 1, i + 2]
        elif field.data_type == DataType.ARRAY_INT64:
            doc_fields[field.name] = [i + 1, i + 1, i + 2]
        elif field.data_type == DataType.ARRAY_UINT64:
            doc_fields[field.name] = [i + 1, i + 1, i + 2]
        elif field.data_type == DataType.ARRAY_FLOAT:
            doc_fields[field.name] = [float(i + 1.1), float(i + 2.1), float(i + 3.1)]
        elif field.data_type == DataType.ARRAY_DOUBLE:
            doc_fields[field.name] = [float(i + 1.11), float(i + 2.11), float(i + 3.11)]
        elif field.data_type == DataType.ARRAY_STRING:
            doc_fields[field.name] = [f"test_{i + 1}", f"test_{i + 2}", f"test_{i + 3}"]
        else:
            raise ValueError(f"Unsupported field type: {field.data_type}")
    for vector in schema.vectors:
        if vector.data_type == DataType.VECTOR_FP16:
            doc_vectors[vector.name] = generate_constant_vector(
                i + 1, vector.dimension, "float16"
            )
        elif vector.data_type == DataType.VECTOR_FP32:
            doc_vectors[vector.name] = generate_constant_vector(
                i + 1, vector.dimension, "float32"
            )
        elif vector.data_type == DataType.VECTOR_INT8:
            doc_vectors[vector.name] = generate_constant_vector(
                i + 1,
                vector.dimension,
                "int8",
            )
        elif vector.data_type == DataType.SPARSE_VECTOR_FP32:
            doc_vectors[vector.name] = generate_sparse_vector(i + 1)
        elif vector.data_type == DataType.SPARSE_VECTOR_FP16:
            doc_vectors[vector.name] = generate_sparse_vector(i + 1)
        else:
            raise ValueError(f"Unsupported vector type: {vector.data_type}")
    return doc_fields, doc_vectors


def generate_doc(i: int, schema: CollectionSchema) -> Doc:
    """Assemble a ``Doc`` with id ``str(i)`` from ``generate_vectordict``."""
    fields, vectors = generate_vectordict(i, schema)
    return Doc(id=str(i), fields=fields, vectors=vectors)
def generate_doc_recall(i: int, schema: CollectionSchema) -> Doc:
    """Assemble a recall-test ``Doc`` with id ``str(i)`` from ``generate_vectordict_recall``."""
    fields, vectors = generate_vectordict_recall(i, schema)
    return Doc(id=str(i), fields=fields, vectors=vectors)


def generate_update_doc(i: int, schema: CollectionSchema) -> Doc:
    """Assemble the updated ``Doc`` for id ``str(i)`` from ``generate_vectordict_update``."""
    fields, vectors = generate_vectordict_update(i, schema)
    return Doc(id=str(i), fields=fields, vectors=vectors)

def generate_doc_random(i, schema: CollectionSchema) -> Doc:
doc_fields = {}
doc_vectors = {}
Expand Down Expand Up @@ -357,15 +435,15 @@ def generate_vectordict_random(schema: CollectionSchema):
for vector in schema.vectors:
if vector.data_type == DataType.VECTOR_FP16:
doc_vectors[vector.name] = generate_constant_vector(
random.randint(1, 100), DEFAULT_VECTOR_DIMENSION, "float16"
random.randint(1, 100), vector.dimension, "float16"
)
elif vector.data_type == DataType.VECTOR_FP32:
doc_vectors[vector.name] = generate_constant_vector(
random.randint(1, 100), DEFAULT_VECTOR_DIMENSION, "float32"
random.randint(1, 100), vector.dimension, "float32"
)
elif vector.data_type == DataType.VECTOR_INT8:
doc_vectors[vector.name] = generate_constant_vector(
random.randint(1, 100), DEFAULT_VECTOR_DIMENSION, "int8"
random.randint(1, 100), vector.dimension, "int8"
)
elif vector.data_type == DataType.SPARSE_VECTOR_FP32:
doc_vectors[vector.name] = generate_sparse_vector(random.randint(1, 100))
Expand Down
Loading
Loading