From 229c087ef5c109a56178f0f2fee5092e106dace6 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 27 Nov 2023 14:05:02 -0800 Subject: [PATCH 01/64] initial support for MSSQL --- .gitattributes | 13 ++ .gitignore | 1 + vectordb_bench/backend/clients/__init__.py | 27 ++- .../backend/clients/mssql/config.py | 48 +++++ vectordb_bench/backend/clients/mssql/mssql.py | 186 ++++++++++++++++++ vectordb_bench/frontend/const/styles.py | 5 +- 6 files changed, 264 insertions(+), 16 deletions(-) create mode 100644 .gitattributes create mode 100644 vectordb_bench/backend/clients/mssql/config.py create mode 100644 vectordb_bench/backend/clients/mssql/mssql.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..8efbe82d5 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,13 @@ +# Thanks to: https://rehansaeed.com/gitattributes-best-practices/ + +# Set default behavior to automatically normalize line endings. +* text=auto + +# Force batch scripts to always use CRLF line endings so that if a repo is accessed +# in Windows via a file share from Linux, the scripts will work. +*.{cmd,[cC][mM][dD]} text eol=crlf +*.{bat,[bB][aA][tT]} text eol=crlf + +# Force bash scripts to always use LF line endings so that if a repo is accessed +# in Unix via a file share from Windows, the scripts will work. 
+*.sh text eol=lf \ No newline at end of file diff --git a/.gitignore b/.gitignore index 004524444..55cc87fa2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ __MACOSX build/ venv/ .idea/ +.venv/ \ No newline at end of file diff --git a/vectordb_bench/backend/clients/__init__.py b/vectordb_bench/backend/clients/__init__.py index 3df11610b..46e20bd00 100644 --- a/vectordb_bench/backend/clients/__init__.py +++ b/vectordb_bench/backend/clients/__init__.py @@ -29,14 +29,17 @@ class DB(Enum): QdrantCloud = "QdrantCloud" WeaviateCloud = "WeaviateCloud" PgVector = "PgVector" - PgVectoRS = "PgVectoRS" Redis = "Redis" Chroma = "Chroma" - + MSSQL = "MSSQL" @property def init_cls(self) -> Type[VectorDB]: """Import while in use""" + if self == DB.MSSQL: + from .mssql.mssql import MSSQL + return MSSQL + if self == DB.Milvus: from .milvus.milvus import Milvus return Milvus @@ -65,10 +68,6 @@ def init_cls(self) -> Type[VectorDB]: from .pgvector.pgvector import PgVector return PgVector - if self == DB.PgVectoRS: - from .pgvecto_rs.pgvecto_rs import PgVectoRS - return PgVectoRS - if self == DB.Redis: from .redis.redis import Redis return Redis @@ -80,6 +79,10 @@ def init_cls(self) -> Type[VectorDB]: @property def config_cls(self) -> Type[DBConfig]: """Import while in use""" + if self == DB.MSSQL: + from .mssql.config import MSSQLConfig + return MSSQLConfig + if self == DB.Milvus: from .milvus.config import MilvusConfig return MilvusConfig @@ -108,10 +111,6 @@ def config_cls(self) -> Type[DBConfig]: from .pgvector.config import PgVectorConfig return PgVectorConfig - if self == DB.PgVectoRS: - from .pgvecto_rs.config import PgVectoRSConfig - return PgVectoRSConfig - if self == DB.Redis: from .redis.config import RedisConfig return RedisConfig @@ -121,6 +120,10 @@ def config_cls(self) -> Type[DBConfig]: return ChromaConfig def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseConfig]: + if self == DB.MSSQL: + from .mssql.config import 
MSSQLVectorIndexConfig + return MSSQLVectorIndexConfig + if self == DB.Milvus: from .milvus.config import _milvus_case_config return _milvus_case_config.get(index_type) @@ -145,10 +148,6 @@ def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseCon from .pgvector.config import PgVectorIndexConfig return PgVectorIndexConfig - if self == DB.PgVectoRS: - from .pgvecto_rs.config import _pgvecto_rs_case_config - return _pgvecto_rs_case_config.get(index_type) - # DB.Pinecone, DB.Chroma, DB.Redis return EmptyDBCaseConfig diff --git a/vectordb_bench/backend/clients/mssql/config.py b/vectordb_bench/backend/clients/mssql/config.py new file mode 100644 index 000000000..8ebb55106 --- /dev/null +++ b/vectordb_bench/backend/clients/mssql/config.py @@ -0,0 +1,48 @@ +from pydantic import BaseModel, SecretStr +from ..api import DBConfig, DBCaseConfig, MetricType + +MSSQL_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;UID=%s;PWD=%s;Connect Timeout=30;" + +class MSSQLConfig(DBConfig): + server: str + database: str + uid: str + pwd: SecretStr + + def to_dict(self) -> dict: + pwd_str = self.pwd.get_secret_value() + return { + "connection_string" : MSSQL_CONNECTION_STRING_PLACEHOLDER%(self.server, self.database, self.uid, pwd_str) + } + + +class MSSQLVectorIndexConfig(BaseModel, DBCaseConfig): + metric_type: MetricType | None = None + lists: int | None = 1000 + probes: int | None = 10 + + def parse_metric(self) -> str: + if self.metric_type == MetricType.L2: + return "vector_l2_ops" + elif self.metric_type == MetricType.IP: + return "vector_ip_ops" + return "vector_cosine_ops" + + def parse_metric_fun_str(self) -> str: + if self.metric_type == MetricType.L2: + return "l2_distance" + elif self.metric_type == MetricType.IP: + return "max_inner_product" + return "cosine_distance" + + def index_param(self) -> dict: + return { + "lists" : self.lists, + "metric" : self.parse_metric() + } + + def search_param(self) -> dict: + 
return { + "probes" : self.probes, + "metric_fun" : self.parse_metric_fun_str() + } \ No newline at end of file diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py new file mode 100644 index 000000000..f1f880623 --- /dev/null +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -0,0 +1,186 @@ +"""Wrapper around the Azure SQL""" + +import logging +from contextlib import contextmanager +from typing import Any + +from ..api import VectorDB, DBCaseConfig + +import pyodbc +import json + +log = logging.getLogger(__name__) + +class MSSQL(VectorDB): + def __init__( + self, + dim: int, + db_config: dict, + db_case_config: DBCaseConfig, + collection_name: str = "vector", + drop_old: bool = False, + **kwargs, + ): + self.db_config = db_config + self.case_config = db_case_config + self.table_name = collection_name + "_" + str(dim) + self.dim = dim + self.schema_name = "benchmark" + + log.info("db_case_config: " + str(db_case_config)) + + log.info(f"Connecting to MSSQL...") + cnxn = pyodbc.connect(self.db_config['connection_string']) + cursor = cnxn.cursor() + + log.info(f"Creating schema...") + cursor.execute(f""" + if (schema_id('{self.schema_name}') is null) begin + exec('create schema [{self.schema_name}] authorization [dbo];') + end; + """) + cnxn.commit() + + # if drop_old: + # log.info(f"Dropping existing tables...") + # cursor.execute(f""" + # drop table if exists [{self.schema_name}].[{self.table_name}] + # """) + # cursor.execute(f""" + # drop table if exists [{self.schema_name}].[{self.table_name}_index] + # """) + # cnxn.commit() + + # log.info(f"Creating vector table...") + # cursor.execute(f""" + # create table [{self.schema_name}].[{self.table_name}] ( + # id int primary key, + # vector nvarchar(max) check(isjson(vector)=1) + # ) + # """) + # cnxn.commit() + + # log.info(f"Creating vector values (index) table...") + # cursor.execute(f""" + # create table [{self.schema_name}].[{self.table_name}_index] + # ( 
+ # vector_id int not null, + # vector_value_id smallint not null, + # vector_value float not null + # ) + # """) + # cnxn.commit() + + # log.info(f"Creating columnstore index...") + # cursor.execute(f""" + # create clustered columnstore index cci_{self.table_name} on [{self.schema_name}].[{self.table_name}_index] + # """) + # cnxn.commit() + + cursor.close() + cnxn.close() + + @contextmanager + def init(self) -> None: + cnxn = pyodbc.connect(self.db_config['connection_string']) + self.cnxn = cnxn + cnxn.autocommit = False + yield + self.cnxn.close() + + def ready_to_load(self): + log.info(f"MSSQL ready to load") + pass + + def optimize(self): + log.info(f"MSSQL optimize") + pass + + def ready_to_search(self): + log.info(f"MSSQL ready to search") + pass + + def insert_embeddings( + self, + embeddings: list[list[float]], + metadata: list[int], + **kwargs: Any, + ) -> (int, Exception): + try: + log.info(f'Loading batch of {len(metadata)} vectors...') + return len(metadata), None + + log.info(f'Truncating staging table...') + cursor = self.cnxn.cursor() + cursor.fast_executemany = True + cursor.execute(f"truncate table [{self.schema_name}].[{self.table_name}]") + cursor.commit() + + log.info(f'Generating param list...') + params = [(metadata[i], str(embeddings[i])) for i in range(len(metadata))] + # params = list() + # for i in range(0, len(metadata)): + # params.append((metadata[i], str(embeddings[i]))) + + log.info(f'Loading staging table...') + cursor.executemany(f"insert into [{self.schema_name}].[{self.table_name}] (id, vector) values (?, ?)", params) + cursor.commit() + + log.info(f'Loading vector index table...') + cursor.execute(f""" + insert into + [{self.schema_name}].[{self.table_name}_index] + select + v.id as [vector_id], + cast([key] as int) as [vector_value_id], + cast([value] as float) as [vector_value] + from + [{self.schema_name}].[{self.table_name}] v + cross apply + openjson([vector]) + """) + cursor.commit() + + return len(metadata), None + except 
Exception as e: + #cursor.rollback() + log.warning(f"Failed to insert data into vector table ([{self.schema_name}].[{self.table_name}]), error: {e}") + return 0, e + + def search_embedding( + self, + query: list[float], + k: int = 100, + filters: dict | None = None, + timeout: int | None = None, + ) -> list[int]: + log.info(f'Query {k} {filters} {timeout}...') + cursor = self.cnxn.cursor() + cursor.execute(f""" + with cteVector as + ( + select + cast([key] as int) as [vector_value_id], + cast([value] as float) as [vector_value] + from + (values (?)) v(vector) + cross apply + openjson([vector]) + ) + select top({k}) + v2.vector_id, + sum(v1.[vector_value] * v2.[vector_value]) as cosine_similarity + from + cteVector v1 + inner join + [{self.schema_name}].[{self.table_name}_index] v2 on v1.vector_value_id = v2.vector_value_id + group by + v2.vector_id + order by + cosine_similarity desc + """, str(query)) + rows = cursor.fetchall() + res = [row.vector_id for row in rows] + return list(res) + + \ No newline at end of file diff --git a/vectordb_bench/frontend/const/styles.py b/vectordb_bench/frontend/const/styles.py index 52d1017a9..22017734a 100644 --- a/vectordb_bench/frontend/const/styles.py +++ b/vectordb_bench/frontend/const/styles.py @@ -43,9 +43,9 @@ def getPatternShape(i): DB.QdrantCloud: "https://assets.zilliz.com/qdrant_b691674fcd.png", DB.WeaviateCloud: "https://assets.zilliz.com/weaviate_4f6f171ebe.png", DB.PgVector: "https://assets.zilliz.com/PG_Vector_d464f2ef5f.png", - DB.PgVectoRS: "https://assets.zilliz.com/PG_Vector_d464f2ef5f.png", DB.Redis: "https://assets.zilliz.com/Redis_Cloud_74b8bfef39.png", - DB.Chroma: "https://assets.zilliz.com/chroma_ceb3f06ed7.png", + DB.Chroma: "https://assets.zilliz.com/chroma_ceb3f06ed7.png", + DB.MSSQL: "https://azuresql.dev/assets/azure-sql-db-100x100.png", } # RedisCloud color: #0D6EFD @@ -59,4 +59,5 @@ def getPatternShape(i): DB.WeaviateCloud.value: "#20C997", DB.PgVector.value: "#4C779A", DB.Redis.value: "#0D6EFD", + 
DB.MSSQL.value: "#4C779A", } From 4dee798fe330d70c2eacc127e1baff7eb87818c3 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 27 Nov 2023 14:51:07 -0800 Subject: [PATCH 02/64] updated MSSQL test --- .../frontend/const/dbCaseConfigs.py | 69 +------------------ vectordb_bench/models.py | 5 +- 2 files changed, 4 insertions(+), 70 deletions(-) diff --git a/vectordb_bench/frontend/const/dbCaseConfigs.py b/vectordb_bench/frontend/const/dbCaseConfigs.py index 1298983ff..bff623bb0 100644 --- a/vectordb_bench/frontend/const/dbCaseConfigs.py +++ b/vectordb_bench/frontend/const/dbCaseConfigs.py @@ -24,7 +24,7 @@ CaseType.Performance768D1M1P, DIVIDER, CaseType.Performance1536D5M1P, - CaseType.Performance1536D500K1P, + CaseType.Performance1536D500K1P, DIVIDER, CaseType.Performance768D10M99P, CaseType.Performance768D1M99P, @@ -111,18 +111,6 @@ class CaseConfigInput(BaseModel): }, ) -CaseConfigParamInput_EFConstruction_PgVectoRS = CaseConfigInput( - label=CaseConfigParamType.EFConstruction, - inputType=InputType.Number, - inputConfig={ - "min": 8, - "max": 512, - "value": 360, - }, - isDisplayed=lambda config: config[CaseConfigParamType.IndexType] - == IndexType.HNSW.value, -) - CaseConfigParamInput_M_ES = CaseConfigInput( label=CaseConfigParamType.M, inputType=InputType.Number, @@ -227,23 +215,6 @@ class CaseConfigInput(BaseModel): }, ) -CaseConfigParamInput_QuantizationType_PgVectoRS = CaseConfigInput( - label=CaseConfigParamType.quantizationType, - inputType=InputType.Option, - inputConfig={ - "options": ["trivial", "scalar", "product"], - }, -) - -CaseConfigParamInput_QuantizationRatio_PgVectoRS = CaseConfigInput( - label=CaseConfigParamType.quantizationRatio, - inputType=InputType.Option, - inputConfig={ - "options": ["x4", "x8", "x16", "x32", "x64"], - }, - isDisplayed=lambda config: config.get(CaseConfigParamType.quantizationType, None) - == "product", -) MilvusLoadConfig = [ CaseConfigParamInput_IndexType, @@ -281,25 +252,6 @@ class CaseConfigInput(BaseModel): 
PgVectorLoadingConfig = [CaseConfigParamInput_Lists] PgVectorPerformanceConfig = [CaseConfigParamInput_Lists, CaseConfigParamInput_Probes] -PgVectoRSLoadingConfig = [ - CaseConfigParamInput_IndexType, - CaseConfigParamInput_M, - CaseConfigParamInput_EFConstruction_PgVectoRS, - CaseConfigParamInput_Nlist, - CaseConfigParamInput_QuantizationType_PgVectoRS, - CaseConfigParamInput_QuantizationRatio_PgVectoRS, -] - -PgVectoRSPerformanceConfig = [ - CaseConfigParamInput_IndexType, - CaseConfigParamInput_M, - CaseConfigParamInput_EFConstruction_PgVectoRS, - CaseConfigParamInput_Nlist, - CaseConfigParamInput_Nprobe, - CaseConfigParamInput_QuantizationType_PgVectoRS, - CaseConfigParamInput_QuantizationRatio_PgVectoRS, -] - CASE_CONFIG_MAP = { DB.Milvus: { CaseType.CapacityDim960: MilvusLoadConfig, @@ -316,7 +268,7 @@ class CaseConfigInput(BaseModel): CaseType.Performance1536D5M1P: MilvusPerformanceConfig, CaseType.Performance1536D500K1P: MilvusPerformanceConfig, CaseType.Performance1536D5M99P: MilvusPerformanceConfig, - CaseType.Performance1536D500K99P: MilvusPerformanceConfig, + CaseType.Performance1536D500K99P: MilvusPerformanceConfig, }, DB.WeaviateCloud: { CaseType.CapacityDim960: WeaviateLoadConfig, @@ -369,21 +321,4 @@ class CaseConfigInput(BaseModel): CaseType.Performance1536D5M99P: PgVectorPerformanceConfig, CaseType.Performance1536D500K99P: PgVectorPerformanceConfig, }, - DB.PgVectoRS: { - CaseType.CapacityDim960: PgVectoRSLoadingConfig, - CaseType.CapacityDim128: PgVectoRSLoadingConfig, - CaseType.Performance768D100M: PgVectoRSPerformanceConfig, - CaseType.Performance768D10M: PgVectoRSPerformanceConfig, - CaseType.Performance768D1M: PgVectoRSPerformanceConfig, - CaseType.Performance768D10M1P: PgVectoRSPerformanceConfig, - CaseType.Performance768D1M1P: PgVectoRSPerformanceConfig, - CaseType.Performance768D10M99P: PgVectoRSPerformanceConfig, - CaseType.Performance768D1M99P: PgVectoRSPerformanceConfig, - CaseType.Performance1536D5M: PgVectoRSPerformanceConfig, - 
CaseType.Performance1536D500K: PgVectoRSPerformanceConfig, - CaseType.Performance1536D5M1P: PgVectoRSPerformanceConfig, - CaseType.Performance1536D500K1P: PgVectoRSPerformanceConfig, - CaseType.Performance1536D5M99P: PgVectorPerformanceConfig, - CaseType.Performance1536D500K99P: PgVectoRSPerformanceConfig, - }, } diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py index 2f9575db3..4e32181b4 100644 --- a/vectordb_bench/models.py +++ b/vectordb_bench/models.py @@ -24,11 +24,11 @@ class LoadTimeoutError(TimeoutError): pass - class PerformanceTimeoutError(TimeoutError): pass + class CaseConfigParamType(Enum): """ Value will be the key of CaseConfig.params and displayed in UI @@ -45,8 +45,6 @@ class CaseConfigParamType(Enum): numCandidates = "num_candidates" lists = "lists" probes = "probes" - quantizationType = "quantizationType" - quantizationRatio = "quantizationRatio" class CustomizedCase(BaseModel): @@ -106,6 +104,7 @@ def flush(self): db=db.value.lower(), ) + def get_db_results(self) -> dict[DB, CaseResult]: db2case = {} for res in self.results: From 65142a50f351c840fc62252133147f49d659063c Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 27 Nov 2023 15:24:20 -0800 Subject: [PATCH 03/64] added MSSQL --- pyproject.toml | 3 +++ vectordb_bench/backend/clients/mssql/mssql.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f73bc2940..a90b0679b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ all = [ "redis", "chromadb", "psycopg2", + "pyodbc" ] qdrant = [ "qdrant-client" ] @@ -64,6 +65,7 @@ pgvector = [ "pgvector", "sqlalchemy" ] pgvecto_rs = [ "psycopg2" ] redis = [ "redis" ] chromadb = [ "chromadb" ] +mssql = [ "pyodbc" ] [project.urls] "repository" = "https://github.com/zilliztech/VectorDBBench" @@ -72,3 +74,4 @@ chromadb = [ "chromadb" ] init_bench = "vectordb_bench.__main__:main" [tool.setuptools_scm] + diff --git a/vectordb_bench/backend/clients/mssql/mssql.py 
b/vectordb_bench/backend/clients/mssql/mssql.py index f1f880623..4c7a88bcb 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -1,4 +1,4 @@ -"""Wrapper around the Azure SQL""" +"""Wrapper around MSSQL""" import logging from contextlib import contextmanager From caacfe93e600a65a11cc90e5c40a341d2d4cbef1 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Thu, 11 Jan 2024 11:48:04 -0800 Subject: [PATCH 04/64] drop old --- vectordb_bench/backend/clients/mssql/mssql.py | 112 +++++++++--------- 1 file changed, 57 insertions(+), 55 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 4c7a88bcb..6ee22bf3d 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -41,41 +41,41 @@ def __init__( """) cnxn.commit() - # if drop_old: - # log.info(f"Dropping existing tables...") - # cursor.execute(f""" - # drop table if exists [{self.schema_name}].[{self.table_name}] - # """) - # cursor.execute(f""" - # drop table if exists [{self.schema_name}].[{self.table_name}_index] - # """) - # cnxn.commit() - - # log.info(f"Creating vector table...") - # cursor.execute(f""" - # create table [{self.schema_name}].[{self.table_name}] ( - # id int primary key, - # vector nvarchar(max) check(isjson(vector)=1) - # ) - # """) - # cnxn.commit() - - # log.info(f"Creating vector values (index) table...") - # cursor.execute(f""" - # create table [{self.schema_name}].[{self.table_name}_index] - # ( - # vector_id int not null, - # vector_value_id smallint not null, - # vector_value float not null - # ) - # """) - # cnxn.commit() - - # log.info(f"Creating columnstore index...") - # cursor.execute(f""" - # create clustered columnstore index cci_{self.table_name} on [{self.schema_name}].[{self.table_name}_index] - # """) - # cnxn.commit() + if drop_old: + log.info(f"Dropping existing tables...") + cursor.execute(f""" + 
drop table if exists [{self.schema_name}].[{self.table_name}] + """) + cursor.execute(f""" + drop table if exists [{self.schema_name}].[{self.table_name}_index] + """) + cnxn.commit() + + log.info(f"Creating vector table...") + cursor.execute(f""" + create table [{self.schema_name}].[{self.table_name}] ( + id int primary key, + vector nvarchar(max) check(isjson(vector)=1) + ) + """) + cnxn.commit() + + log.info(f"Creating vector values (index) table...") + cursor.execute(f""" + create table [{self.schema_name}].[{self.table_name}_index] + ( + vector_id int not null, + vector_value_id smallint not null, + vector_value float not null + ) + """) + cnxn.commit() + + log.info(f"Creating columnstore index...") + cursor.execute(f""" + create clustered columnstore index cci_{self.table_name} on [{self.schema_name}].[{self.table_name}_index] + """) + cnxn.commit() cursor.close() cnxn.close() @@ -108,13 +108,13 @@ def insert_embeddings( ) -> (int, Exception): try: log.info(f'Loading batch of {len(metadata)} vectors...') - return len(metadata), None + #return len(metadata), None - log.info(f'Truncating staging table...') - cursor = self.cnxn.cursor() - cursor.fast_executemany = True - cursor.execute(f"truncate table [{self.schema_name}].[{self.table_name}]") - cursor.commit() + + # log.info(f'Truncating staging table...') + # cursor.fast_executemany = True + # cursor.execute(f"truncate table [{self.schema_name}].[{self.table_name}]") + # cursor.commit() log.info(f'Generating param list...') params = [(metadata[i], str(embeddings[i])) for i in range(len(metadata))] @@ -123,23 +123,25 @@ def insert_embeddings( # params.append((metadata[i], str(embeddings[i]))) log.info(f'Loading staging table...') + cursor = self.cnxn.cursor() + cursor.fast_executemany = True cursor.executemany(f"insert into [{self.schema_name}].[{self.table_name}] (id, vector) values (?, ?)", params) cursor.commit() - log.info(f'Loading vector index table...') - cursor.execute(f""" - insert into - 
[{self.schema_name}].[{self.table_name}_index] - select - v.id as [vector_id], - cast([key] as int) as [vector_value_id], - cast([value] as float) as [vector_value] - from - [{self.schema_name}].[{self.table_name}] v - cross apply - openjson([vector]) - """) - cursor.commit() + # log.info(f'Loading vector index table...') + # cursor.execute(f""" + # insert into + # [{self.schema_name}].[{self.table_name}_index] + # select + # v.id as [vector_id], + # cast([key] as int) as [vector_value_id], + # cast([value] as float) as [vector_value] + # from + # [{self.schema_name}].[{self.table_name}] v + # cross apply + # openjson([vector]) + # """) + # cursor.commit() return len(metadata), None except Exception as e: From 37d8b52b68716d274d684c102df3c2afebd0ec9e Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 19 Jan 2024 09:45:10 -0800 Subject: [PATCH 05/64] support for native vectors --- .../backend/clients/mssql/config.py | 2 +- vectordb_bench/backend/clients/mssql/mssql.py | 83 +++---------------- 2 files changed, 14 insertions(+), 71 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/config.py b/vectordb_bench/backend/clients/mssql/config.py index 8ebb55106..354a6ef6d 100644 --- a/vectordb_bench/backend/clients/mssql/config.py +++ b/vectordb_bench/backend/clients/mssql/config.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, SecretStr from ..api import DBConfig, DBCaseConfig, MetricType -MSSQL_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;UID=%s;PWD=%s;Connect Timeout=30;" +MSSQL_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;UID=%s;PWD=%s;Connect Timeout=30;TrustServerCertificate=Yes" class MSSQLConfig(DBConfig): server: str diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 6ee22bf3d..7734a9233 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ 
b/vectordb_bench/backend/clients/mssql/mssql.py @@ -30,6 +30,7 @@ def __init__( log.info("db_case_config: " + str(db_case_config)) log.info(f"Connecting to MSSQL...") + log.info(self.db_config['connection_string']) cnxn = pyodbc.connect(self.db_config['connection_string']) cursor = cnxn.cursor() @@ -45,38 +46,18 @@ def __init__( log.info(f"Dropping existing tables...") cursor.execute(f""" drop table if exists [{self.schema_name}].[{self.table_name}] - """) - cursor.execute(f""" - drop table if exists [{self.schema_name}].[{self.table_name}_index] - """) + """) cnxn.commit() log.info(f"Creating vector table...") cursor.execute(f""" create table [{self.schema_name}].[{self.table_name}] ( - id int primary key, - vector nvarchar(max) check(isjson(vector)=1) - ) - """) - cnxn.commit() - - log.info(f"Creating vector values (index) table...") - cursor.execute(f""" - create table [{self.schema_name}].[{self.table_name}_index] - ( - vector_id int not null, - vector_value_id smallint not null, - vector_value float not null + id int not null primary key nonclustered, + [vector] varbinary(8000) not null ) """) cnxn.commit() - - log.info(f"Creating columnstore index...") - cursor.execute(f""" - create clustered columnstore index cci_{self.table_name} on [{self.schema_name}].[{self.table_name}_index] - """) - cnxn.commit() - + cursor.close() cnxn.close() @@ -110,38 +91,14 @@ def insert_embeddings( log.info(f'Loading batch of {len(metadata)} vectors...') #return len(metadata), None - - # log.info(f'Truncating staging table...') - # cursor.fast_executemany = True - # cursor.execute(f"truncate table [{self.schema_name}].[{self.table_name}]") - # cursor.commit() - log.info(f'Generating param list...') params = [(metadata[i], str(embeddings[i])) for i in range(len(metadata))] - # params = list() - # for i in range(0, len(metadata)): - # params.append((metadata[i], str(embeddings[i]))) - log.info(f'Loading staging table...') + log.info(f'Loading table...') cursor = self.cnxn.cursor() 
cursor.fast_executemany = True - cursor.executemany(f"insert into [{self.schema_name}].[{self.table_name}] (id, vector) values (?, ?)", params) - cursor.commit() - - # log.info(f'Loading vector index table...') - # cursor.execute(f""" - # insert into - # [{self.schema_name}].[{self.table_name}_index] - # select - # v.id as [vector_id], - # cast([key] as int) as [vector_value_id], - # cast([value] as float) as [vector_value] - # from - # [{self.schema_name}].[{self.table_name}] v - # cross apply - # openjson([vector]) - # """) - # cursor.commit() + cursor.executemany(f"insert into [{self.schema_name}].[{self.table_name}] (id, [vector]) values (?, vector(cast(? as varchar(max))))", params) + cursor.commit() return len(metadata), None except Exception as e: @@ -158,26 +115,12 @@ def search_embedding( ) -> list[int]: log.info(f'Query {k} {filters} {timeout}...') cursor = self.cnxn.cursor() - cursor.execute(f""" - with cteVector as - ( - select - cast([key] as int) as [vector_value_id], - cast([value] as float) as [vector_value] - from - (values (?)) v(vector) - cross apply - openjson([vector]) - ) + cursor.execute(f""" select top({k}) - v2.vector_id, - sum(v1.[vector_value] * v2.[vector_value]) as cosine_similarity - from - cteVector v1 - inner join - [{self.schema_name}].[{self.table_name}_index] v2 on v1.vector_value_id = v2.vector_value_id - group by - v2.vector_id + id, + vector_distance('cosine', [vector], vector(cast(? 
as varchar(max)))) as cosine_similarity + from + [{self.schema_name}].[{self.table_name}] v order by cosine_similarity desc """, str(query)) From 7a359e323eb72376903e934c7ff13d0f6924251d Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 19 Jan 2024 12:05:03 -0800 Subject: [PATCH 06/64] notes on how to install and run --- notes.txt | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 notes.txt diff --git a/notes.txt b/notes.txt new file mode 100644 index 000000000..5d6c44808 --- /dev/null +++ b/notes.txt @@ -0,0 +1,29 @@ +create login benchmark with password = 'B3nch_mark'; +create user benchmark for login benchmark; +alter role db_owner add member benchmark; + + +--- + +to connect to SQL Server from WSL2 get the hostname (cannot use "localhost") + +echo $(hostname).local + +--- + +LAPTOP-DM-2.local +testvectors + + +-- + +install odbc if in linux + +python3.11 -m venv .venv + +. ./.venv/bin/activate + +pip install -e '.[test]' + +pip install -e '.[sql]' + From 079d020b6aaaad9729278ebc97c2a389f6ae02f9 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 19 Jan 2024 12:07:40 -0800 Subject: [PATCH 07/64] more details --- notes.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/notes.txt b/notes.txt index 5d6c44808..0febb3905 100644 --- a/notes.txt +++ b/notes.txt @@ -17,8 +17,11 @@ testvectors -- + install odbc if in linux +install python 3.11 + python3.11 -m venv .venv . 
./.venv/bin/activate From 1158598cf137ab79d50cecd792f3fb7343c88ce3 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 22 Jan 2024 17:02:44 -0800 Subject: [PATCH 08/64] fixed column name --- vectordb_bench/backend/clients/mssql/mssql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 7734a9233..eed18ee51 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -125,7 +125,7 @@ def search_embedding( cosine_similarity desc """, str(query)) rows = cursor.fetchall() - res = [row.vector_id for row in rows] + res = [row.id for row in rows] return list(res) \ No newline at end of file From d2a13f048114ea9ab8b22e9d789f579ca0202f80 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 22 Jan 2024 17:27:41 -0800 Subject: [PATCH 09/64] removed fast_executemany as generates error --- vectordb_bench/backend/clients/mssql/mssql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index eed18ee51..07302ec66 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -96,7 +96,7 @@ def insert_embeddings( log.info(f'Loading table...') cursor = self.cnxn.cursor() - cursor.fast_executemany = True + #cursor.fast_executemany = True cursor.executemany(f"insert into [{self.schema_name}].[{self.table_name}] (id, [vector]) values (?, vector(cast(? 
as varchar(max))))", params) cursor.commit() From f0761777b8cd98045b61bf6ad2e385aa67d849bc Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 22 Jan 2024 17:35:00 -0800 Subject: [PATCH 10/64] added link --- notes.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/notes.txt b/notes.txt index 0febb3905..0002ebfa7 100644 --- a/notes.txt +++ b/notes.txt @@ -22,6 +22,8 @@ install odbc if in linux install python 3.11 +https://ubuntuhandbook.org/index.php/2022/10/python-3-11-released-how-install-ubuntu/ + python3.11 -m venv .venv . ./.venv/bin/activate From 44e319c3799fc0539d6db0b816e1bac68d67c3ae Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 26 Jan 2024 09:06:57 -0800 Subject: [PATCH 11/64] send binary data instead of strings --- notes.txt | 11 +++ vectordb_bench/backend/clients/mssql/mssql.py | 72 +++++++++++++++---- 2 files changed, 68 insertions(+), 15 deletions(-) diff --git a/notes.txt b/notes.txt index 0002ebfa7..a1b2c74c8 100644 --- a/notes.txt +++ b/notes.txt @@ -32,3 +32,14 @@ pip install -e '.[test]' pip install -e '.[sql]' +python -m vectordb_bench + +export DROP_OLD=True|False +export export NUM_PER_BATCH=50000 (default is 5000) + + + + +select used_page_count * 8. / 1024. / 1024. 
as size_in_gb, index_id, row_count, used_page_count from sys.dm_db_partition_stats +where object_id = object_id('[benchmark].[vector_768]') + diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 07302ec66..cebc7261e 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -7,7 +7,7 @@ from ..api import VectorDB, DBCaseConfig import pyodbc -import json +import struct log = logging.getLogger(__name__) @@ -30,7 +30,7 @@ def __init__( log.info("db_case_config: " + str(db_case_config)) log.info(f"Connecting to MSSQL...") - log.info(self.db_config['connection_string']) + #log.info(self.db_config['connection_string']) cnxn = pyodbc.connect(self.db_config['connection_string']) cursor = cnxn.cursor() @@ -43,21 +43,49 @@ def __init__( cnxn.commit() if drop_old: - log.info(f"Dropping existing tables...") + log.info(f"Dropping existing table...") cursor.execute(f""" drop table if exists [{self.schema_name}].[{self.table_name}] """) cnxn.commit() - log.info(f"Creating vector table...") - cursor.execute(f""" + log.info(f"Creating vector table...") + cursor.execute(f""" + if object_id('[{self.schema_name}].[{self.table_name}]') is null begin create table [{self.schema_name}].[{self.table_name}] ( id int not null primary key nonclustered, [vector] varbinary(8000) not null ) - """) - cnxn.commit() + end + """) + cnxn.commit() + log.info(f"Creating table type...") + cursor.execute(f""" + if type_id('dbo.vector_payload') is null begin + create type dbo.vector_payload as table + ( + id int not null, + [vector] varbinary(8000) not null + ) + end + """) + cursor.commit() + + log.info(f"Creating stored procedure...") + cursor.execute(f""" + create or alter procedure dbo.stp_load_vectors + @dummy int, + @payload dbo.vector_payload readonly + as + begin + set nocount on + insert into [{self.schema_name}].[{self.table_name}] (id, vector) select id, [vector] from @payload; + 
--insert into [{self.schema_name}].[{self.table_name}] (id, vector) select id, vector(cast([vector] as varchar(max))) from @payload; + end + """) + cnxn.commit() + cursor.close() cnxn.close() @@ -81,6 +109,21 @@ def ready_to_search(self): log.info(f"MSSQL ready to search") pass + def array_to_vector(self, a:list[float]) -> bytearray: + b = bytearray() + b.append(169) + b.append(170) + + b += bytearray(struct.pack("i", len(a))) + + b.append(0) + b.append(0) + + for i in range(len(a)): + b += bytearray(struct.pack("f", a[i])) + + return b + def insert_embeddings( self, embeddings: list[list[float]], @@ -92,12 +135,12 @@ def insert_embeddings( #return len(metadata), None log.info(f'Generating param list...') - params = [(metadata[i], str(embeddings[i])) for i in range(len(metadata))] + params = [(metadata[i], self.array_to_vector(embeddings[i])) for i in range(len(metadata))] log.info(f'Loading table...') cursor = self.cnxn.cursor() - #cursor.fast_executemany = True - cursor.executemany(f"insert into [{self.schema_name}].[{self.table_name}] (id, [vector]) values (?, vector(cast(? as varchar(max))))", params) + #cursor.fast_executemany = True + cursor.execute("EXEC dbo.stp_load_vectors @dummy=?, @payload=?", (1, params)) cursor.commit() return len(metadata), None @@ -113,17 +156,16 @@ def search_embedding( filters: dict | None = None, timeout: int | None = None, ) -> list[int]: - log.info(f'Query {k} {filters} {timeout}...') + log.info(f'Query top:{k} filters:{filters} timeout:{timeout}...') cursor = self.cnxn.cursor() cursor.execute(f""" select top({k}) - id, - vector_distance('cosine', [vector], vector(cast(? as varchar(max)))) as cosine_similarity + id from [{self.schema_name}].[{self.table_name}] v order by - cosine_similarity desc - """, str(query)) + vector_distance('cosine', [vector], ?) 
+ """, self.array_to_vector(query)) rows = cursor.fetchall() res = [row.id for row in rows] return list(res) From c581321d09fb4859baa15db413626e92e947cfb1 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 26 Jan 2024 10:18:21 -0800 Subject: [PATCH 12/64] added filter support --- .../backend/clients/mssql/config.py | 12 +++---- vectordb_bench/backend/clients/mssql/mssql.py | 32 +++++++++++++------ 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/config.py b/vectordb_bench/backend/clients/mssql/config.py index 354a6ef6d..72004f7f4 100644 --- a/vectordb_bench/backend/clients/mssql/config.py +++ b/vectordb_bench/backend/clients/mssql/config.py @@ -23,17 +23,17 @@ class MSSQLVectorIndexConfig(BaseModel, DBCaseConfig): def parse_metric(self) -> str: if self.metric_type == MetricType.L2: - return "vector_l2_ops" + return "euclidean" elif self.metric_type == MetricType.IP: - return "vector_ip_ops" - return "vector_cosine_ops" + return "dot" + return "cosine" def parse_metric_fun_str(self) -> str: if self.metric_type == MetricType.L2: - return "l2_distance" + return "euclidean" elif self.metric_type == MetricType.IP: - return "max_inner_product" - return "cosine_distance" + return "dot" + return "cosine" def index_param(self) -> dict: return { diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index cebc7261e..207330936 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -156,16 +156,30 @@ def search_embedding( filters: dict | None = None, timeout: int | None = None, ) -> list[int]: - log.info(f'Query top:{k} filters:{filters} timeout:{timeout}...') + search_param = self.case_config.search_param() + metric_fun = search_param["metric_fun"] + log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} timeout:{timeout}...') cursor = self.cnxn.cursor() - cursor.execute(f""" - select top({k}) - id - 
from - [{self.schema_name}].[{self.table_name}] v - order by - vector_distance('cosine', [vector], ?) - """, self.array_to_vector(query)) + if filters: + cursor.execute(f""" + select top({k}) + id + from + [{self.schema_name}].[{self.table_name}] v + where + id > ? + order by + vector_distance('{metric_fun}', [vector], ?) + """, int(filters.get('id')), self.array_to_vector(query)) + else: + cursor.execute(f""" + select top({k}) + id + from + [{self.schema_name}].[{self.table_name}] v + order by + vector_distance('{metric_fun}', [vector], ?) + """, self.array_to_vector(query)) rows = cursor.fetchall() res = [row.id for row in rows] return list(res) From 4c73b36cb079234dde12b683aab2f6d970af78e4 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 26 Jan 2024 10:35:57 -0800 Subject: [PATCH 13/64] added notes --- notes.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/notes.txt b/notes.txt index a1b2c74c8..e01df43b2 100644 --- a/notes.txt +++ b/notes.txt @@ -12,7 +12,7 @@ echo $(hostname).local --- LAPTOP-DM-2.local -testvectors +vectordb -- @@ -24,6 +24,8 @@ install python 3.11 https://ubuntuhandbook.org/index.php/2022/10/python-3-11-released-how-install-ubuntu/ +create a folder + python3.11 -m venv .venv . 
./.venv/bin/activate From 6c3ebf5d54afe272eab99c98963db5fe7653f172 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 26 Jan 2024 10:37:30 -0800 Subject: [PATCH 14/64] added link --- notes.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/notes.txt b/notes.txt index e01df43b2..cfd3f2090 100644 --- a/notes.txt +++ b/notes.txt @@ -20,6 +20,8 @@ vectordb install odbc if in linux +https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server + install python 3.11 https://ubuntuhandbook.org/index.php/2022/10/python-3-11-released-how-install-ubuntu/ From 47119809d75b7ca6e8fc98dafb305e9b14b825c5 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 26 Jan 2024 10:52:20 -0800 Subject: [PATCH 15/64] updated for installing pip --- binary-test.py | 32 ++++++++++++++++++++++++++++++++ notes.txt | 5 +++++ tvp-test.py | 20 ++++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 binary-test.py create mode 100644 tvp-test.py diff --git a/binary-test.py b/binary-test.py new file mode 100644 index 000000000..a5bb0ae60 --- /dev/null +++ b/binary-test.py @@ -0,0 +1,32 @@ +import pyodbc +import struct +import binascii + +uid = "benchmark" +pwd = "B3nch_mark" +connection_string = ( + r"DRIVER={ODBC Driver 18 for SQL Server};" + r"SERVER=LAPTOP-DM-2.local;" + r"DATABASE=vectordb;" + f"UID={uid};PWD={pwd};TrustServerCertificate=Yes" +) + +b = bytearray() +b.append(169) +b.append(170) + +items:float = [100, 2000, 1, 0, -1, 0.3, 200] + +b += bytearray(struct.pack("i", len(items))) + +b.append(0) +b.append(0) + +for i in range(len(items)): + b += bytearray(struct.pack("f", items[i])) + + +cnxn = pyodbc.connect(connection_string, autocommit=True) +crsr = cnxn.cursor() +crsr.execute("insert into [benchmark].[vector_768] (id, [vector]) values (1, ?)", b) +crsr.close() diff --git a/notes.txt b/notes.txt index cfd3f2090..806800a77 100644 --- a/notes.txt +++ b/notes.txt @@ -26,6 +26,11 @@ install python 3.11 
https://ubuntuhandbook.org/index.php/2022/10/python-3-11-released-how-install-ubuntu/ +instal python3.11 pip + +apt install python3.11 python3.11-distutils +curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 + create a folder python3.11 -m venv .venv diff --git a/tvp-test.py b/tvp-test.py new file mode 100644 index 000000000..da64ee0f7 --- /dev/null +++ b/tvp-test.py @@ -0,0 +1,20 @@ +import pyodbc + +uid = "benchmark" +pwd = "B3nch_mark" +connection_string = ( + r"DRIVER={ODBC Driver 18 for SQL Server};" + r"SERVER=LAPTOP-DM-2.local;" + r"DATABASE=vectordb;" + f"UID={uid};PWD={pwd};TrustServerCertificate=Yes" +) +cnxn = pyodbc.connect(connection_string, autocommit=True) +crsr = cnxn.cursor() + +print(crsr.execute("SELECT SCHEMA_NAME()").fetchval()) # default schema + +line_items = [(1, "[1,2,3]"), (2, '[1,2,3]')] +dummy = 1 +sql = "EXEC dbo.stp_load_pippo @dummy=?, @payload=?" +params = (dummy, line_items) +crsr.execute(sql, params) From 3d27fee0b1c6de787586f5fe07fcc2a4b5d33b16 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 26 Jan 2024 11:25:52 -0800 Subject: [PATCH 16/64] added pip --- notes.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notes.txt b/notes.txt index 806800a77..ad544545e 100644 --- a/notes.txt +++ b/notes.txt @@ -28,7 +28,7 @@ https://ubuntuhandbook.org/index.php/2022/10/python-3-11-released-how-install-ub instal python3.11 pip -apt install python3.11 python3.11-distutils +apt install python3.11 python3.11-distutils python3.11-venv curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 create a folder From 01d21bb0115260da367dd55c1c5dc31cc8da492d Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 26 Jan 2024 17:02:23 -0800 Subject: [PATCH 17/64] use clustered index --- vectordb_bench/backend/clients/mssql/mssql.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 
207330936..ae305cb93 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -53,9 +53,9 @@ def __init__( cursor.execute(f""" if object_id('[{self.schema_name}].[{self.table_name}]') is null begin create table [{self.schema_name}].[{self.table_name}] ( - id int not null primary key nonclustered, + id int not null primary key clustered, [vector] varbinary(8000) not null - ) + ) end """) cnxn.commit() @@ -81,7 +81,6 @@ def __init__( begin set nocount on insert into [{self.schema_name}].[{self.table_name}] (id, vector) select id, [vector] from @payload; - --insert into [{self.schema_name}].[{self.table_name}] (id, vector) select id, vector(cast([vector] as varchar(max))) from @payload; end """) cnxn.commit() @@ -173,13 +172,8 @@ def search_embedding( """, int(filters.get('id')), self.array_to_vector(query)) else: cursor.execute(f""" - select top({k}) - id - from - [{self.schema_name}].[{self.table_name}] v - order by - vector_distance('{metric_fun}', [vector], ?) - """, self.array_to_vector(query)) + exec dbo.stp_kmeans_search @p=30, @k=?, @v=? + """, k, self.array_to_vector(query)) rows = cursor.fetchall() res = [row.id for row in rows] return list(res) From dbf1ea893605185d944af32a9033717fa37af4d7 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 26 Jan 2024 17:02:33 -0800 Subject: [PATCH 18/64] correct notes --- notes.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notes.txt b/notes.txt index ad544545e..24d722ddc 100644 --- a/notes.txt +++ b/notes.txt @@ -39,14 +39,14 @@ python3.11 -m venv .venv pip install -e '.[test]' -pip install -e '.[sql]' +pip install -e '.[mssql]' python -m vectordb_bench export DROP_OLD=True|False export export NUM_PER_BATCH=50000 (default is 5000) - +check windows firewall if cannot connect to sql select used_page_count * 8. / 1024. / 1024. 
as size_in_gb, index_id, row_count, used_page_count from sys.dm_db_partition_stats From 26e60799db0a494ae2600a8f2011e823830541e1 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 26 Jan 2024 17:02:49 -0800 Subject: [PATCH 19/64] code improvements --- binary-test.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/binary-test.py b/binary-test.py index a5bb0ae60..b87a4ac9d 100644 --- a/binary-test.py +++ b/binary-test.py @@ -12,15 +12,13 @@ ) b = bytearray() -b.append(169) -b.append(170) +b.append([169, 170]) items:float = [100, 2000, 1, 0, -1, 0.3, 200] b += bytearray(struct.pack("i", len(items))) -b.append(0) -b.append(0) +b.append([0,0]) for i in range(len(items)): b += bytearray(struct.pack("f", items[i])) @@ -28,5 +26,5 @@ cnxn = pyodbc.connect(connection_string, autocommit=True) crsr = cnxn.cursor() -crsr.execute("insert into [benchmark].[vector_768] (id, [vector]) values (1, ?)", b) +crsr.execute("insert into dbo.test_vector_binary ([vector]) values (?)", b) crsr.close() From 0409d9c7ecf636daac805c826778de83602308da Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Sun, 28 Jan 2024 14:55:45 -0800 Subject: [PATCH 20/64] updated to use binary format --- binary-test.py | 30 ------------------ notes.txt | 9 ++++++ sql/find_similar.sql | 26 ++++++++++++++++ sql/stp_filter_similar.sql | 30 ++++++++++++++++++ sql/stp_find_similar.sql | 31 +++++++++++++++++++ tvp-test.py | 20 ------------ .../backend/clients/mssql/config.py | 6 +--- vectordb_bench/backend/clients/mssql/mssql.py | 28 ++++++++++------- 8 files changed, 113 insertions(+), 67 deletions(-) delete mode 100644 binary-test.py create mode 100644 sql/find_similar.sql create mode 100644 sql/stp_filter_similar.sql create mode 100644 sql/stp_find_similar.sql delete mode 100644 tvp-test.py diff --git a/binary-test.py b/binary-test.py deleted file mode 100644 index b87a4ac9d..000000000 --- a/binary-test.py +++ /dev/null @@ -1,30 +0,0 @@ -import pyodbc -import struct -import binascii - 
-uid = "benchmark" -pwd = "B3nch_mark" -connection_string = ( - r"DRIVER={ODBC Driver 18 for SQL Server};" - r"SERVER=LAPTOP-DM-2.local;" - r"DATABASE=vectordb;" - f"UID={uid};PWD={pwd};TrustServerCertificate=Yes" -) - -b = bytearray() -b.append([169, 170]) - -items:float = [100, 2000, 1, 0, -1, 0.3, 200] - -b += bytearray(struct.pack("i", len(items))) - -b.append([0,0]) - -for i in range(len(items)): - b += bytearray(struct.pack("f", items[i])) - - -cnxn = pyodbc.connect(connection_string, autocommit=True) -crsr = cnxn.cursor() -crsr.execute("insert into dbo.test_vector_binary ([vector]) values (?)", b) -crsr.close() diff --git a/notes.txt b/notes.txt index 24d722ddc..da00f29d8 100644 --- a/notes.txt +++ b/notes.txt @@ -52,3 +52,12 @@ check windows firewall if cannot connect to sql select used_page_count * 8. / 1024. / 1024. as size_in_gb, index_id, row_count, used_page_count from sys.dm_db_partition_stats where object_id = object_id('[benchmark].[vector_768]') +---- + +declare @v varbinary(8000) +select @v = vector from benchmark.vector_768 where id = 1224 + +select id from [$vector].[find_similar$vector_768$vector](@v, 100, 30, 1, 'cosine') +--exec dbo.stp_kmeans_search @p=30, @k=10, @v=@v + +--exec dbo.stp_kmeans_search @p=1000, @k=10, @v=@v diff --git a/sql/find_similar.sql b/sql/find_similar.sql new file mode 100644 index 000000000..bb9155e80 --- /dev/null +++ b/sql/find_similar.sql @@ -0,0 +1,26 @@ +create or alter function [$vector].[find_similar$vector_768$vector] (@v varbinary(8000), @k int, @p int, @d float, @m varchar(50) = 'cosine') +returns table +as return +with cteProbes as +( + select top (@p) + k.cluster_id + from + [$vector].[vector_768$vector$clusters_centroids] k + order by + vector_distance(@m, k.[centroid], @v) +) +select top(@k) + v.*, + cosine_distance = vector_distance(@m, v.[vector], @v) +from + [$vector].[vector_768$vector$clusters] c +inner join + cteProbes k on k.cluster_id = c.cluster_id +inner join + [benchmark].[vector_768] v on 
v.id = c.item_id +where + vector_distance(@m, v.[vector], @v) <= @d +order by + cosine_distance + diff --git a/sql/stp_filter_similar.sql b/sql/stp_filter_similar.sql new file mode 100644 index 000000000..70306cd34 --- /dev/null +++ b/sql/stp_filter_similar.sql @@ -0,0 +1,30 @@ +create or alter procedure [$vector].[stp_filter_similar$vector_768$vector] +@id int, +@v varbinary(8000), +@k int, +@p int, +@m varchar(50) = 'cosine' +as +with cteProbes as +( + select top (@p) + k.cluster_id + from + [$vector].[vector_768$vector$clusters_centroids] k + order by + vector_distance(@m, k.[centroid], @v) +) +select top(@k) + v.id +from + [$vector].[vector_768$vector$clusters] c +inner join + cteProbes k on k.cluster_id = c.cluster_id +inner join + [benchmark].[vector_768] v on v.id = c.item_id +where + v.id > @id +order by + vector_distance(@m, v.[vector], @v) + +GO diff --git a/sql/stp_find_similar.sql b/sql/stp_find_similar.sql new file mode 100644 index 000000000..949c2900a --- /dev/null +++ b/sql/stp_find_similar.sql @@ -0,0 +1,31 @@ +SET ANSI_NULLS ON +GO +SET QUOTED_IDENTIFIER ON +GO +create or alter procedure [$vector].[stp_find_similar$vector_768$vector] +@v varbinary(8000), +@k int, +@p int, +@m varchar(50) = 'cosine' +as +with cteProbes as +( + select top (@p) + k.cluster_id + from + [$vector].[vector_768$vector$clusters_centroids] k + order by + vector_distance(@m, k.[centroid], @v) +) +select top(@k) + v.id +from + [$vector].[vector_768$vector$clusters] c +inner join + cteProbes k on k.cluster_id = c.cluster_id +inner join + [benchmark].[vector_768] v on v.id = c.item_id +order by + vector_distance(@m, v.[vector], @v) + +GO diff --git a/tvp-test.py b/tvp-test.py deleted file mode 100644 index da64ee0f7..000000000 --- a/tvp-test.py +++ /dev/null @@ -1,20 +0,0 @@ -import pyodbc - -uid = "benchmark" -pwd = "B3nch_mark" -connection_string = ( - r"DRIVER={ODBC Driver 18 for SQL Server};" - r"SERVER=LAPTOP-DM-2.local;" - r"DATABASE=vectordb;" - 
f"UID={uid};PWD={pwd};TrustServerCertificate=Yes" -) -cnxn = pyodbc.connect(connection_string, autocommit=True) -crsr = cnxn.cursor() - -print(crsr.execute("SELECT SCHEMA_NAME()").fetchval()) # default schema - -line_items = [(1, "[1,2,3]"), (2, '[1,2,3]')] -dummy = 1 -sql = "EXEC dbo.stp_load_pippo @dummy=?, @payload=?" -params = (dummy, line_items) -crsr.execute(sql, params) diff --git a/vectordb_bench/backend/clients/mssql/config.py b/vectordb_bench/backend/clients/mssql/config.py index 72004f7f4..e8b5c3b72 100644 --- a/vectordb_bench/backend/clients/mssql/config.py +++ b/vectordb_bench/backend/clients/mssql/config.py @@ -29,11 +29,7 @@ def parse_metric(self) -> str: return "cosine" def parse_metric_fun_str(self) -> str: - if self.metric_type == MetricType.L2: - return "euclidean" - elif self.metric_type == MetricType.IP: - return "dot" - return "cosine" + return self.parse_metric() def index_param(self) -> dict: return { diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index ae305cb93..3a5012926 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -157,23 +157,27 @@ def search_embedding( ) -> list[int]: search_param = self.case_config.search_param() metric_fun = search_param["metric_fun"] - log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} timeout:{timeout}...') + log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') cursor = self.cnxn.cursor() if filters: cursor.execute(f""" - select top({k}) - id - from - [{self.schema_name}].[{self.table_name}] v - where - id > ? - order by - vector_distance('{metric_fun}', [vector], ?) - """, int(filters.get('id')), self.array_to_vector(query)) + exec [$vector].[stp_filter_similar${self.table_name}$vector] @id=?, @v=?, @k=?, @p=?, @m=? 
+ """, + int(filters.get('id')), + self.array_to_vector(query), + k, + int(search_param["probes"]), + metric_fun + ) else: cursor.execute(f""" - exec dbo.stp_kmeans_search @p=30, @k=?, @v=? - """, k, self.array_to_vector(query)) + exec [$vector].[stp_find_similar${self.table_name}$vector] @v=?, @k=?, @p=?, @m=? + """, + self.array_to_vector(query), + k, + int(search_param["probes"]), + metric_fun + ) rows = cursor.fetchall() res = [row.id for row in rows] return list(res) From a30019aa43a78cbbbcb568c9ebcf520b99c02d53 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Sun, 28 Jan 2024 15:14:09 -0800 Subject: [PATCH 21/64] smaller optimizations --- vectordb_bench/backend/clients/mssql/mssql.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 3a5012926..5c92b144f 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -92,8 +92,10 @@ def __init__( def init(self) -> None: cnxn = pyodbc.connect(self.db_config['connection_string']) self.cnxn = cnxn - cnxn.autocommit = False + cnxn.autocommit = True + self.cursor = cnxn.cursor() yield + self.cursor.close() self.cnxn.close() def ready_to_load(self): @@ -109,17 +111,18 @@ def ready_to_search(self): pass def array_to_vector(self, a:list[float]) -> bytearray: - b = bytearray() - b.append(169) - b.append(170) + # header + b = bytearray([169, 170]) + # number of items b += bytearray(struct.pack("i", len(a))) + pf = f"{len(a)}f" - b.append(0) - b.append(0) + # filler + b += bytearray([0,0]) - for i in range(len(a)): - b += bytearray(struct.pack("f", a[i])) + # items + b += bytearray(struct.pack(pf, *a)) return b @@ -137,10 +140,9 @@ def insert_embeddings( params = [(metadata[i], self.array_to_vector(embeddings[i])) for i in range(len(metadata))] log.info(f'Loading table...') - cursor = self.cnxn.cursor() + cursor = self.cursor 
#cursor.fast_executemany = True - cursor.execute("EXEC dbo.stp_load_vectors @dummy=?, @payload=?", (1, params)) - cursor.commit() + cursor.execute("EXEC dbo.stp_load_vectors @dummy=?, @payload=?", (1, params)) return len(metadata), None except Exception as e: @@ -157,8 +159,8 @@ def search_embedding( ) -> list[int]: search_param = self.case_config.search_param() metric_fun = search_param["metric_fun"] - log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') - cursor = self.cnxn.cursor() + #log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') + cursor = self.cursor if filters: cursor.execute(f""" exec [$vector].[stp_filter_similar${self.table_name}$vector] @id=?, @v=?, @k=?, @p=?, @m=? @@ -180,6 +182,6 @@ def search_embedding( ) rows = cursor.fetchall() res = [row.id for row in rows] - return list(res) + return res \ No newline at end of file From f1b3f43f8d265a9f66ed4ea5e627d4f74139d24b Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Wed, 31 Jan 2024 13:38:26 -0800 Subject: [PATCH 22/64] improved scripts --- sql/schema.sql | 1 + sql/stp_find_similar.sql | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) create mode 100644 sql/schema.sql diff --git a/sql/schema.sql b/sql/schema.sql new file mode 100644 index 000000000..bf4f7eefe --- /dev/null +++ b/sql/schema.sql @@ -0,0 +1 @@ +create schema [$vector] authorization [dbo] diff --git a/sql/stp_find_similar.sql b/sql/stp_find_similar.sql index 949c2900a..3d206e2f0 100644 --- a/sql/stp_find_similar.sql +++ b/sql/stp_find_similar.sql @@ -1,7 +1,3 @@ -SET ANSI_NULLS ON -GO -SET QUOTED_IDENTIFIER ON -GO create or alter procedure [$vector].[stp_find_similar$vector_768$vector] @v varbinary(8000), @k int, From 6e2013c2ec492a88d101f7a669b93cdc4e03e3cb Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Wed, 31 Jan 2024 13:38:56 -0800 Subject: [PATCH 23/64] corrected header values to comply with latest bits --- 
vectordb_bench/backend/clients/mssql/mssql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 5c92b144f..1c8e6c789 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -112,7 +112,7 @@ def ready_to_search(self): def array_to_vector(self, a:list[float]) -> bytearray: # header - b = bytearray([169, 170]) + b = bytearray([169, 1]) # number of items b += bytearray(struct.pack("i", len(a))) From 38a8497a89a04b34bcbc0bad4f20800e9b98ded0 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 22 Apr 2024 11:47:13 -0700 Subject: [PATCH 24/64] using vanilla vector_distance --- sql/stp_filter_similar.sql | 21 ++----------- sql/stp_filter_similar_kmeans.sql | 30 +++++++++++++++++++ sql/stp_find_similar.sql | 19 ++---------- sql/stp_find_similar_kmeans.sql | 27 +++++++++++++++++ vectordb_bench/backend/clients/mssql/mssql.py | 11 ++++--- 5 files changed, 67 insertions(+), 41 deletions(-) create mode 100644 sql/stp_filter_similar_kmeans.sql create mode 100644 sql/stp_find_similar_kmeans.sql diff --git a/sql/stp_filter_similar.sql b/sql/stp_filter_similar.sql index 70306cd34..6a962ca97 100644 --- a/sql/stp_filter_similar.sql +++ b/sql/stp_filter_similar.sql @@ -1,30 +1,15 @@ -create or alter procedure [$vector].[stp_filter_similar$vector_768$vector] +create or alter procedure [$vector].[stp_filter_similar] @id int, @v varbinary(8000), @k int, -@p int, @m varchar(50) = 'cosine' as -with cteProbes as -( - select top (@p) - k.cluster_id - from - [$vector].[vector_768$vector$clusters_centroids] k - order by - vector_distance(@m, k.[centroid], @v) -) select top(@k) v.id from - [$vector].[vector_768$vector$clusters] c -inner join - cteProbes k on k.cluster_id = c.cluster_id -inner join - [benchmark].[vector_768] v on v.id = c.item_id + [benchmark].[vector_768] v where v.id > @id order by - vector_distance(@m, 
v.[vector], @v) - + vector_distance(@m, @v, v.[vector]) GO diff --git a/sql/stp_filter_similar_kmeans.sql b/sql/stp_filter_similar_kmeans.sql new file mode 100644 index 000000000..70306cd34 --- /dev/null +++ b/sql/stp_filter_similar_kmeans.sql @@ -0,0 +1,30 @@ +create or alter procedure [$vector].[stp_filter_similar$vector_768$vector] +@id int, +@v varbinary(8000), +@k int, +@p int, +@m varchar(50) = 'cosine' +as +with cteProbes as +( + select top (@p) + k.cluster_id + from + [$vector].[vector_768$vector$clusters_centroids] k + order by + vector_distance(@m, k.[centroid], @v) +) +select top(@k) + v.id +from + [$vector].[vector_768$vector$clusters] c +inner join + cteProbes k on k.cluster_id = c.cluster_id +inner join + [benchmark].[vector_768] v on v.id = c.item_id +where + v.id > @id +order by + vector_distance(@m, v.[vector], @v) + +GO diff --git a/sql/stp_find_similar.sql b/sql/stp_find_similar.sql index 3d206e2f0..c587ea8d5 100644 --- a/sql/stp_find_similar.sql +++ b/sql/stp_find_similar.sql @@ -1,27 +1,12 @@ create or alter procedure [$vector].[stp_find_similar$vector_768$vector] @v varbinary(8000), @k int, -@p int, @m varchar(50) = 'cosine' as -with cteProbes as -( - select top (@p) - k.cluster_id - from - [$vector].[vector_768$vector$clusters_centroids] k - order by - vector_distance(@m, k.[centroid], @v) -) select top(@k) v.id from - [$vector].[vector_768$vector$clusters] c -inner join - cteProbes k on k.cluster_id = c.cluster_id -inner join - [benchmark].[vector_768] v on v.id = c.item_id + [benchmark].[vector_768] v order by - vector_distance(@m, v.[vector], @v) - + vector_distance(@m, @v, v.[vector]) GO diff --git a/sql/stp_find_similar_kmeans.sql b/sql/stp_find_similar_kmeans.sql new file mode 100644 index 000000000..3d206e2f0 --- /dev/null +++ b/sql/stp_find_similar_kmeans.sql @@ -0,0 +1,27 @@ +create or alter procedure [$vector].[stp_find_similar$vector_768$vector] +@v varbinary(8000), +@k int, +@p int, +@m varchar(50) = 'cosine' +as +with cteProbes 
as +( + select top (@p) + k.cluster_id + from + [$vector].[vector_768$vector$clusters_centroids] k + order by + vector_distance(@m, k.[centroid], @v) +) +select top(@k) + v.id +from + [$vector].[vector_768$vector$clusters] c +inner join + cteProbes k on k.cluster_id = c.cluster_id +inner join + [benchmark].[vector_768] v on v.id = c.item_id +order by + vector_distance(@m, v.[vector], @v) + +GO diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 1c8e6c789..df3c63b6e 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -159,25 +159,24 @@ def search_embedding( ) -> list[int]: search_param = self.case_config.search_param() metric_fun = search_param["metric_fun"] + probes = int(search_param["probes"]), #log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') cursor = self.cursor if filters: cursor.execute(f""" - exec [$vector].[stp_filter_similar${self.table_name}$vector] @id=?, @v=?, @k=?, @p=?, @m=? + exec [$vector].[stp_filter_similar] @id=?, @v=?, @k=?, @m=? """, int(filters.get('id')), self.array_to_vector(query), - k, - int(search_param["probes"]), + k, metric_fun ) else: cursor.execute(f""" - exec [$vector].[stp_find_similar${self.table_name}$vector] @v=?, @k=?, @p=?, @m=? + exec [$vector].[stp_find_similar] @v=?, @k=?, @m=? 
""", self.array_to_vector(query), - k, - int(search_param["probes"]), + k, metric_fun ) rows = cursor.fetchall() From 075f011a2ab45becbdff46107332d90f937c2a13 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 22 Apr 2024 11:47:30 -0700 Subject: [PATCH 25/64] added setup instructions for MSSQL --- MSSQL-Setup.md | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 MSSQL-Setup.md diff --git a/MSSQL-Setup.md b/MSSQL-Setup.md new file mode 100644 index 000000000..3a22e8b07 --- /dev/null +++ b/MSSQL-Setup.md @@ -0,0 +1,53 @@ +# Run VectorDBBench agains MSSQL database + +VectorDBBench has been tested running on WSL2 + Ubuntu 22.04.4 LTS. + +## Install ODBC + +Follow instructions here: https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server + +## Install Python 3.11 + +Follow instructions here: https://ubuntuhandbook.org/index.php/2022/10/python-3-11-released-how-install-ubuntu/) + +## Install pip for Python3.11 : + +Use the following commands: + +``` +apt install python3.11 python3.11-distutils python3.11-venv +curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 +``` + +## Clone the repository + +Clone the repository into a local folder + +## Create Virtual Environment + +In local folder where you have cloned the repository, create a virtual environment: + +``` +python3.11 -m venv .venv +``` + +then activate it: + +``` +. 
./.venv/bin/activate +``` + +## Install VectorDBBench dependencies + +Install the VectorDBBench dependencies + +``` +pip install -e '.[test]' +pip install -e '.[mssql]' +``` + +## Run VectorDBBench + +``` +python -m vectordb_bench +``` \ No newline at end of file From fe356a6539813690d81e59e5a6770bbdda0ec4a8 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 22 Apr 2024 11:57:58 -0700 Subject: [PATCH 26/64] using ad hoc sql instead of stored procedure to find and filter --- vectordb_bench/backend/clients/mssql/mssql.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index df3c63b6e..1b834de23 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -158,26 +158,26 @@ def search_embedding( timeout: int | None = None, ) -> list[int]: search_param = self.case_config.search_param() - metric_fun = search_param["metric_fun"] - probes = int(search_param["probes"]), + metric_function = search_param["metric_fun"] + #probes = int(search_param["probes"]), #log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') cursor = self.cursor if filters: cursor.execute(f""" - exec [$vector].[stp_filter_similar] @id=?, @v=?, @k=?, @m=? + select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id > @id order by vector_distance(@m, @v, v.[vector]) """, - int(filters.get('id')), - self.array_to_vector(query), k, - metric_fun + int(filters.get('id')), + metric_function. + self.array_to_vector(query) ) else: cursor.execute(f""" - exec [$vector].[stp_find_similar] @v=?, @k=?, @m=? 
+ select top(@k) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(@m, @v, v.[vector]) """, - self.array_to_vector(query), k, - metric_fun + metric_function, + self.array_to_vector(query) ) rows = cursor.fetchall() res = [row.id for row in rows] From 588695b26eef1bcf6b678bba1d056b238d2f3f0b Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 22 Apr 2024 12:11:30 -0700 Subject: [PATCH 27/64] fixed code --- vectordb_bench/backend/clients/mssql/mssql.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 1b834de23..fdfaedd41 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -164,16 +164,16 @@ def search_embedding( cursor = self.cursor if filters: cursor.execute(f""" - select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id > @id order by vector_distance(@m, @v, v.[vector]) + select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id > ? order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) """, k, int(filters.get('id')), - metric_function. + metric_function, self.array_to_vector(query) ) else: cursor.execute(f""" - select top(@k) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(@m, @v, v.[vector]) + select top(?) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(cast(? 
as varchar(20)), ?, v.[vector]) """, k, metric_function, From 09bf65b834e976e127f53305cae410876d30f36f Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 22 Apr 2024 12:27:14 -0700 Subject: [PATCH 28/64] fixed code --- vectordb_bench/backend/clients/mssql/mssql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index fdfaedd41..afb4a0539 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -59,7 +59,7 @@ def __init__( end """) cnxn.commit() - + log.info(f"Creating table type...") cursor.execute(f""" if type_id('dbo.vector_payload') is null begin @@ -164,7 +164,7 @@ def search_embedding( cursor = self.cursor if filters: cursor.execute(f""" - select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id > ? order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) + select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(cast(? 
as varchar(20)), ?, v.[vector]) """, k, int(filters.get('id')), From 654c7ddadf07214cbfb5d874e8bc443a2c5379e8 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Thu, 25 Apr 2024 16:51:32 -0700 Subject: [PATCH 29/64] Update MSSQL-Setup.md --- MSSQL-Setup.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MSSQL-Setup.md b/MSSQL-Setup.md index 3a22e8b07..bfb1f397f 100644 --- a/MSSQL-Setup.md +++ b/MSSQL-Setup.md @@ -15,7 +15,7 @@ Follow instructions here: https://ubuntuhandbook.org/index.php/2022/10/python-3- Use the following commands: ``` -apt install python3.11 python3.11-distutils python3.11-venv +sudo apt install python3.11 python3.11-distutils python3.11-venv curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 ``` @@ -50,4 +50,4 @@ pip install -e '.[mssql]' ``` python -m vectordb_bench -``` \ No newline at end of file +``` From 59585600d9e544fde15467ac610bf030039d6057 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 3 May 2024 16:43:37 -0700 Subject: [PATCH 30/64] fixed dependency error --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index a90b0679b..6236d6a67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,10 @@ build-backend = "setuptools.build_meta" [tool.setuptools.package-data] "vectordb_bench.results" = ["*.json"] +[tool.setuptools.packages.find] +where = ["."] +include = ["vectordb_bench"] + [project] name = "vectordb-bench" authors = [ From c687c85df209c1078a0fb3155de27cfc66227b02 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 3 May 2024 19:14:27 -0700 Subject: [PATCH 31/64] pin streamlit to 1.31.1 to avoid incompatibilities --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6236d6a67..026d85784 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ dependencies = [ "pytz", "streamlit-autorefresh", - "streamlit>=1.23.0", + "streamlit==1.31.1", 
"streamlit_extras", "tqdm", "s3fs", From b881af832ca3d48efee3b2454cbaa4dea80cfb26 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 3 May 2024 19:40:18 -0700 Subject: [PATCH 32/64] merged with latest version --- notes.txt | 2 +- vectordb_bench/backend/clients/__init__.py | 17 +++++++++++++++-- vectordb_bench/frontend/const/dbCaseConfigs.py | 14 +++++++++++++- vectordb_bench/frontend/const/styles.py | 7 ++++--- vectordb_bench/models.py | 3 +-- 5 files changed, 34 insertions(+), 9 deletions(-) diff --git a/notes.txt b/notes.txt index da00f29d8..f2be0fe0d 100644 --- a/notes.txt +++ b/notes.txt @@ -1,4 +1,4 @@ -create login benchmark with password = 'B3nch_mark'; +create login benchmark with password = 'BOR23bSu0ZsDc9zR8wdHn2w0P5yoEbBD'; create user benchmark for login benchmark; alter role db_owner add member benchmark; diff --git a/vectordb_bench/backend/clients/__init__.py b/vectordb_bench/backend/clients/__init__.py index e8e83851a..e59a90556 100644 --- a/vectordb_bench/backend/clients/__init__.py +++ b/vectordb_bench/backend/clients/__init__.py @@ -29,6 +29,7 @@ class DB(Enum): QdrantCloud = "QdrantCloud" WeaviateCloud = "WeaviateCloud" PgVector = "PgVector" + PgVectoRS = "PgVectoRS" Redis = "Redis" Chroma = "Chroma" MSSQL = "MSSQL" @@ -39,7 +40,7 @@ def init_cls(self) -> Type[VectorDB]: if self == DB.MSSQL: from .mssql.mssql import MSSQL return MSSQL - + if self == DB.Milvus: from .milvus.milvus import Milvus return Milvus @@ -68,6 +69,10 @@ def init_cls(self) -> Type[VectorDB]: from .pgvector.pgvector import PgVector return PgVector + if self == DB.PgVectoRS: + from .pgvecto_rs.pgvecto_rs import PgVectoRS + return PgVectoRS + if self == DB.Redis: from .redis.redis import Redis return Redis @@ -111,6 +116,10 @@ def config_cls(self) -> Type[DBConfig]: from .pgvector.config import PgVectorConfig return PgVectorConfig + if self == DB.PgVectoRS: + from .pgvecto_rs.config import PgVectoRSConfig + return PgVectoRSConfig + if self == DB.Redis: from .redis.config 
import RedisConfig return RedisConfig @@ -123,7 +132,7 @@ def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseCon if self == DB.MSSQL: from .mssql.config import MSSQLVectorIndexConfig return MSSQLVectorIndexConfig - + if self == DB.Milvus: from .milvus.config import _milvus_case_config return _milvus_case_config.get(index_type) @@ -148,6 +157,10 @@ def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseCon from .pgvector.config import _pgvector_case_config return _pgvector_case_config.get(index_type) + if self == DB.PgVectoRS: + from .pgvecto_rs.config import _pgvecto_rs_case_config + return _pgvecto_rs_case_config.get(index_type) + # DB.Pinecone, DB.Chroma, DB.Redis return EmptyDBCaseConfig diff --git a/vectordb_bench/frontend/const/dbCaseConfigs.py b/vectordb_bench/frontend/const/dbCaseConfigs.py index 672d54ce8..9b122910a 100644 --- a/vectordb_bench/frontend/const/dbCaseConfigs.py +++ b/vectordb_bench/frontend/const/dbCaseConfigs.py @@ -24,7 +24,7 @@ CaseType.Performance768D1M1P, DIVIDER, CaseType.Performance1536D5M1P, - CaseType.Performance1536D500K1P, + CaseType.Performance1536D500K1P, DIVIDER, CaseType.Performance768D10M99P, CaseType.Performance768D1M99P, @@ -115,6 +115,18 @@ class CaseConfigInput(BaseModel): }, ) +CaseConfigParamInput_EFConstruction_PgVectoRS = CaseConfigInput( + label=CaseConfigParamType.EFConstruction, + inputType=InputType.Number, + inputConfig={ + "min": 8, + "max": 512, + "value": 360, + }, + isDisplayed=lambda config: config[CaseConfigParamType.IndexType] + == IndexType.HNSW.value, +) + CaseConfigParamInput_M_ES = CaseConfigInput( label=CaseConfigParamType.M, inputType=InputType.Number, diff --git a/vectordb_bench/frontend/const/styles.py b/vectordb_bench/frontend/const/styles.py index 22017734a..b51d0c7eb 100644 --- a/vectordb_bench/frontend/const/styles.py +++ b/vectordb_bench/frontend/const/styles.py @@ -43,9 +43,10 @@ def getPatternShape(i): DB.QdrantCloud: 
"https://assets.zilliz.com/qdrant_b691674fcd.png", DB.WeaviateCloud: "https://assets.zilliz.com/weaviate_4f6f171ebe.png", DB.PgVector: "https://assets.zilliz.com/PG_Vector_d464f2ef5f.png", + DB.PgVectoRS: "https://assets.zilliz.com/PG_Vector_d464f2ef5f.png", DB.Redis: "https://assets.zilliz.com/Redis_Cloud_74b8bfef39.png", - DB.Chroma: "https://assets.zilliz.com/chroma_ceb3f06ed7.png", - DB.MSSQL: "https://azuresql.dev/assets/azure-sql-db-100x100.png", + DB.Chroma: "https://assets.zilliz.com/chroma_ceb3f06ed7.png", + DB.MSSQL: "https://azuresql.dev/assets/azure-sql-db-100x100.png" } # RedisCloud color: #0D6EFD @@ -59,5 +60,5 @@ def getPatternShape(i): DB.WeaviateCloud.value: "#20C997", DB.PgVector.value: "#4C779A", DB.Redis.value: "#0D6EFD", - DB.MSSQL.value: "#4C779A", + DB.MSSQL.value: "#4C779A" } diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py index 9453a9c17..3c2a5b9aa 100644 --- a/vectordb_bench/models.py +++ b/vectordb_bench/models.py @@ -24,11 +24,11 @@ class LoadTimeoutError(TimeoutError): pass + class PerformanceTimeoutError(TimeoutError): pass - class CaseConfigParamType(Enum): """ Value will be the key of CaseConfig.params and displayed in UI @@ -118,7 +118,6 @@ def flush(self): db=db.value.lower(), ) - def get_db_results(self) -> dict[DB, CaseResult]: db2case = {} for res in self.results: From be1098991184a2a5b9f79d75e5e5aaf18e2f25c9 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 3 May 2024 20:00:42 -0700 Subject: [PATCH 33/64] refactored code --- .../backend/clients/mssql/config.py | 10 ++---- vectordb_bench/backend/clients/mssql/mssql.py | 31 ++++++++++++++----- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/config.py b/vectordb_bench/backend/clients/mssql/config.py index e8b5c3b72..a12821db1 100644 --- a/vectordb_bench/backend/clients/mssql/config.py +++ b/vectordb_bench/backend/clients/mssql/config.py @@ -18,8 +18,7 @@ def to_dict(self) -> dict: class 
MSSQLVectorIndexConfig(BaseModel, DBCaseConfig): metric_type: MetricType | None = None - lists: int | None = 1000 - probes: int | None = 10 + efSearch: int | None = 48 def parse_metric(self) -> str: if self.metric_type == MetricType.L2: @@ -28,9 +27,6 @@ def parse_metric(self) -> str: return "dot" return "cosine" - def parse_metric_fun_str(self) -> str: - return self.parse_metric() - def index_param(self) -> dict: return { "lists" : self.lists, @@ -39,6 +35,6 @@ def index_param(self) -> dict: def search_param(self) -> dict: return { - "probes" : self.probes, - "metric_fun" : self.parse_metric_fun_str() + "efSearch" : self.efSearch, + "metric" : self.parse_metric() } \ No newline at end of file diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index afb4a0539..2560e499c 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -158,26 +158,43 @@ def search_embedding( timeout: int | None = None, ) -> list[int]: search_param = self.case_config.search_param() - metric_function = search_param["metric_fun"] - #probes = int(search_param["probes"]), + metric_function = search_param["metric"] + efSearch = search_param["efSearch"] #log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') cursor = self.cursor if filters: + # cursor.execute(f""" + # select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) + # """, + # k, + # int(filters.get('id')), + # metric_function, + # self.array_to_vector(query) + # ) cursor.execute(f""" - select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) + select id from [{self.schema_name}].[{self.table_name}$hsnw_filter](?,?,?,?) 
""", + metric_function, k, + int(efSearch), int(filters.get('id')), - metric_function, self.array_to_vector(query) ) else: + # cursor.execute(f""" + # select top(?) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) + # """, + # k, + # metric_function, + # self.array_to_vector(query) + # ) cursor.execute(f""" - select top(?) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) + select id from [{self.schema_name}].[{self.table_name}$hsnw_search](?,?,?,?) """, - k, metric_function, - self.array_to_vector(query) + k, + int(efSearch), + self.array_to_vector(query) ) rows = cursor.fetchall() res = [row.id for row in rows] From c67bd37fefe2bc49be02f4276191048d72f2db82 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Thu, 23 May 2024 13:35:00 -0700 Subject: [PATCH 34/64] updated notes --- notes.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notes.txt b/notes.txt index f2be0fe0d..0ea75f0b6 100644 --- a/notes.txt +++ b/notes.txt @@ -14,7 +14,7 @@ echo $(hostname).local LAPTOP-DM-2.local vectordb - + export LOG_LEVEL="DEBUG" -- From 000458cd30cd8f28ae788abdd1770a93517f0220 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 26 Aug 2024 11:43:56 -0700 Subject: [PATCH 35/64] using plain vector_distance --- vectordb_bench/backend/clients/mssql/mssql.py | 33 +++++-------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 2560e499c..c288a02e0 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -2,7 +2,7 @@ import logging from contextlib import contextmanager -from typing import Any +from typing import Any, Generator, Optional, Tuple, Sequence from ..api import VectorDB, DBCaseConfig @@ -89,7 +89,7 @@ def __init__( cnxn.close() @contextmanager - def 
init(self) -> None: + def init(self) -> Generator[None, None, None]: cnxn = pyodbc.connect(self.db_config['connection_string']) self.cnxn = cnxn cnxn.autocommit = True @@ -131,7 +131,7 @@ def insert_embeddings( embeddings: list[list[float]], metadata: list[int], **kwargs: Any, - ) -> (int, Exception): + ) -> Tuple[int, Optional[Exception]]: try: log.info(f'Loading batch of {len(metadata)} vectors...') #return len(metadata), None @@ -163,38 +163,21 @@ def search_embedding( #log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') cursor = self.cursor if filters: - # cursor.execute(f""" - # select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) - # """, - # k, - # int(filters.get('id')), - # metric_function, - # self.array_to_vector(query) - # ) cursor.execute(f""" - select id from [{self.schema_name}].[{self.table_name}$hsnw_filter](?,?,?,?) + select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) """, - metric_function, k, - int(efSearch), int(filters.get('id')), + metric_function, self.array_to_vector(query) ) else: - # cursor.execute(f""" - # select top(?) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) - # """, - # k, - # metric_function, - # self.array_to_vector(query) - # ) cursor.execute(f""" - select id from [{self.schema_name}].[{self.table_name}$hsnw_search](?,?,?,?) + select top(?) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(cast(? 
as varchar(20)), ?, v.[vector]) """, - metric_function, k, - int(efSearch), - self.array_to_vector(query) + metric_function, + self.array_to_vector(query) ) rows = cursor.fetchall() res = [row.id for row in rows] From 923e17748f6910b4aed00814a9c800a56db47ffe Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Mon, 26 Aug 2024 11:45:14 -0700 Subject: [PATCH 36/64] commented unused parameter --- vectordb_bench/backend/clients/mssql/mssql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index c288a02e0..6df8e91a5 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -159,7 +159,7 @@ def search_embedding( ) -> list[int]: search_param = self.case_config.search_param() metric_function = search_param["metric"] - efSearch = search_param["efSearch"] + #efSearch = search_param["efSearch"] #log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') cursor = self.cursor if filters: From e281dc3423b00be08a1f88f1a338a331cb253775 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Thu, 9 Jan 2025 22:53:26 +0000 Subject: [PATCH 37/64] support for vector type --- .../backend/clients/mssql/config.py | 2 +- vectordb_bench/backend/clients/mssql/mssql.py | 28 ++++--------------- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/config.py b/vectordb_bench/backend/clients/mssql/config.py index a12821db1..53990dfa5 100644 --- a/vectordb_bench/backend/clients/mssql/config.py +++ b/vectordb_bench/backend/clients/mssql/config.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, SecretStr from ..api import DBConfig, DBCaseConfig, MetricType -MSSQL_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;UID=%s;PWD=%s;Connect Timeout=30;TrustServerCertificate=Yes" 
+MSSQL_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;UID=%s;PWD=%s;LongAsMax=yes;Connect Timeout=30;TrustServerCertificate=Yes" class MSSQLConfig(DBConfig): server: str diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 6df8e91a5..49f086c91 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -7,7 +7,7 @@ from ..api import VectorDB, DBCaseConfig import pyodbc -import struct +import json log = logging.getLogger(__name__) @@ -30,7 +30,7 @@ def __init__( log.info("db_case_config: " + str(db_case_config)) log.info(f"Connecting to MSSQL...") - #log.info(self.db_config['connection_string']) + log.info(self.db_config['connection_string']) cnxn = pyodbc.connect(self.db_config['connection_string']) cursor = cnxn.cursor() @@ -54,7 +54,7 @@ def __init__( if object_id('[{self.schema_name}].[{self.table_name}]') is null begin create table [{self.schema_name}].[{self.table_name}] ( id int not null primary key clustered, - [vector] varbinary(8000) not null + [vector] vector({self.dim}) not null ) end """) @@ -66,7 +66,7 @@ def __init__( create type dbo.vector_payload as table ( id int not null, - [vector] varbinary(8000) not null + [vector] vector({self.dim}) not null ) end """) @@ -109,22 +109,6 @@ def optimize(self): def ready_to_search(self): log.info(f"MSSQL ready to search") pass - - def array_to_vector(self, a:list[float]) -> bytearray: - # header - b = bytearray([169, 1]) - - # number of items - b += bytearray(struct.pack("i", len(a))) - pf = f"{len(a)}f" - - # filler - b += bytearray([0,0]) - - # items - b += bytearray(struct.pack(pf, *a)) - - return b def insert_embeddings( self, @@ -137,13 +121,13 @@ def insert_embeddings( #return len(metadata), None log.info(f'Generating param list...') - params = [(metadata[i], self.array_to_vector(embeddings[i])) for i in range(len(metadata))] + params = 
[(metadata[i], json.dumps(embeddings[i])) for i in range(len(metadata))] log.info(f'Loading table...') cursor = self.cursor #cursor.fast_executemany = True cursor.execute("EXEC dbo.stp_load_vectors @dummy=?, @payload=?", (1, params)) - + #cursor.executemany (f"insert into [{self.schema_name}].[{self.table_name}] (id, vector) values(?, ?);", params) return len(metadata), None except Exception as e: #cursor.rollback() From 118d00264f784fe7e7aee74b4f6ce161a9e4afaa Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Thu, 9 Jan 2025 23:06:31 +0000 Subject: [PATCH 38/64] completed basic vector type support --- vectordb_bench/backend/clients/mssql/mssql.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 49f086c91..bccfd49d1 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -124,10 +124,8 @@ def insert_embeddings( params = [(metadata[i], json.dumps(embeddings[i])) for i in range(len(metadata))] log.info(f'Loading table...') - cursor = self.cursor - #cursor.fast_executemany = True + cursor = self.cursor cursor.execute("EXEC dbo.stp_load_vectors @dummy=?, @payload=?", (1, params)) - #cursor.executemany (f"insert into [{self.schema_name}].[{self.table_name}] (id, vector) values(?, ?);", params) return len(metadata), None except Exception as e: #cursor.rollback() @@ -148,20 +146,20 @@ def search_embedding( cursor = self.cursor if filters: cursor.execute(f""" - select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) + select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(?, cast(? 
as varchar({self.dim})), v.[vector]) """, k, int(filters.get('id')), metric_function, - self.array_to_vector(query) + json.dumps(query) ) else: cursor.execute(f""" - select top(?) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(cast(? as varchar(20)), ?, v.[vector]) + select top(?) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(?, cast(? as vector({self.dim})), v.[vector]) """, k, metric_function, - self.array_to_vector(query) + json.dumps(query) ) rows = cursor.fetchall() res = [row.id for row in rows] From 54d86fcdc38c547b76980ef191d005bb52ab4528 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 10 Jan 2025 01:56:40 +0000 Subject: [PATCH 39/64] added vector index creation --- vectordb_bench/backend/clients/mssql/mssql.py | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index bccfd49d1..4723a9b27 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -26,11 +26,12 @@ def __init__( self.table_name = collection_name + "_" + str(dim) self.dim = dim self.schema_name = "benchmark" + self.drop_old = drop_old log.info("db_case_config: " + str(db_case_config)) log.info(f"Connecting to MSSQL...") - log.info(self.db_config['connection_string']) + #log.info(self.db_config['connection_string']) cnxn = pyodbc.connect(self.db_config['connection_string']) cursor = cnxn.cursor() @@ -94,9 +95,13 @@ def init(self) -> Generator[None, None, None]: self.cnxn = cnxn cnxn.autocommit = True self.cursor = cnxn.cursor() - yield - self.cursor.close() - self.cnxn.close() + try: + yield + finally: + self.cursor.close() + self.cnxn.close() + self.cursor = None + self.cnxn = None def ready_to_load(self): log.info(f"MSSQL ready to load") @@ -104,7 +109,20 @@ def ready_to_load(self): def optimize(self): log.info(f"MSSQL optimize") - pass + 
cursor = self.cursor + if self.drop_old: + cursor.execute(f""" + if exists(select * from sys.indexes where object_id = object_id('[{self.schema_name}].[{self.table_name}]') and type=8) + begin + drop index vec_idx on [{self.schema_name}].[{self.table_name}]; + end + """, + ) + + cursor.execute(f""" + create vector index vec_idx on [benchmark].[vector_1536]([vector]) with (metric = 'cosine', type = 'DiskANN'); + """ + ) def ready_to_search(self): log.info(f"MSSQL ready to search") From 6ec2ac395e33a4891a27eebc5bf11a9fd0b86a3b Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 10 Jan 2025 02:02:03 +0000 Subject: [PATCH 40/64] removed unneeded files --- sql/find_similar.sql | 26 -------------------------- sql/schema.sql | 1 - sql/stp_filter_similar.sql | 15 --------------- sql/stp_filter_similar_kmeans.sql | 30 ------------------------------ sql/stp_find_similar.sql | 12 ------------ sql/stp_find_similar_kmeans.sql | 27 --------------------------- 6 files changed, 111 deletions(-) delete mode 100644 sql/find_similar.sql delete mode 100644 sql/schema.sql delete mode 100644 sql/stp_filter_similar.sql delete mode 100644 sql/stp_filter_similar_kmeans.sql delete mode 100644 sql/stp_find_similar.sql delete mode 100644 sql/stp_find_similar_kmeans.sql diff --git a/sql/find_similar.sql b/sql/find_similar.sql deleted file mode 100644 index bb9155e80..000000000 --- a/sql/find_similar.sql +++ /dev/null @@ -1,26 +0,0 @@ -create or alter function [$vector].[find_similar$vector_768$vector] (@v varbinary(8000), @k int, @p int, @d float, @m varchar(50) = 'cosine') -returns table -as return -with cteProbes as -( - select top (@p) - k.cluster_id - from - [$vector].[vector_768$vector$clusters_centroids] k - order by - vector_distance(@m, k.[centroid], @v) -) -select top(@k) - v.*, - cosine_distance = vector_distance(@m, v.[vector], @v) -from - [$vector].[vector_768$vector$clusters] c -inner join - cteProbes k on k.cluster_id = c.cluster_id -inner join - [benchmark].[vector_768] v on 
v.id = c.item_id -where - vector_distance(@m, v.[vector], @v) <= @d -order by - cosine_distance - diff --git a/sql/schema.sql b/sql/schema.sql deleted file mode 100644 index bf4f7eefe..000000000 --- a/sql/schema.sql +++ /dev/null @@ -1 +0,0 @@ -create schema [$vector] authorization [dbo] diff --git a/sql/stp_filter_similar.sql b/sql/stp_filter_similar.sql deleted file mode 100644 index 6a962ca97..000000000 --- a/sql/stp_filter_similar.sql +++ /dev/null @@ -1,15 +0,0 @@ -create or alter procedure [$vector].[stp_filter_similar] -@id int, -@v varbinary(8000), -@k int, -@m varchar(50) = 'cosine' -as -select top(@k) - v.id -from - [benchmark].[vector_768] v -where - v.id > @id -order by - vector_distance(@m, @v, v.[vector]) -GO diff --git a/sql/stp_filter_similar_kmeans.sql b/sql/stp_filter_similar_kmeans.sql deleted file mode 100644 index 70306cd34..000000000 --- a/sql/stp_filter_similar_kmeans.sql +++ /dev/null @@ -1,30 +0,0 @@ -create or alter procedure [$vector].[stp_filter_similar$vector_768$vector] -@id int, -@v varbinary(8000), -@k int, -@p int, -@m varchar(50) = 'cosine' -as -with cteProbes as -( - select top (@p) - k.cluster_id - from - [$vector].[vector_768$vector$clusters_centroids] k - order by - vector_distance(@m, k.[centroid], @v) -) -select top(@k) - v.id -from - [$vector].[vector_768$vector$clusters] c -inner join - cteProbes k on k.cluster_id = c.cluster_id -inner join - [benchmark].[vector_768] v on v.id = c.item_id -where - v.id > @id -order by - vector_distance(@m, v.[vector], @v) - -GO diff --git a/sql/stp_find_similar.sql b/sql/stp_find_similar.sql deleted file mode 100644 index c587ea8d5..000000000 --- a/sql/stp_find_similar.sql +++ /dev/null @@ -1,12 +0,0 @@ -create or alter procedure [$vector].[stp_find_similar$vector_768$vector] -@v varbinary(8000), -@k int, -@m varchar(50) = 'cosine' -as -select top(@k) - v.id -from - [benchmark].[vector_768] v -order by - vector_distance(@m, @v, v.[vector]) -GO diff --git a/sql/stp_find_similar_kmeans.sql 
b/sql/stp_find_similar_kmeans.sql deleted file mode 100644 index 3d206e2f0..000000000 --- a/sql/stp_find_similar_kmeans.sql +++ /dev/null @@ -1,27 +0,0 @@ -create or alter procedure [$vector].[stp_find_similar$vector_768$vector] -@v varbinary(8000), -@k int, -@p int, -@m varchar(50) = 'cosine' -as -with cteProbes as -( - select top (@p) - k.cluster_id - from - [$vector].[vector_768$vector$clusters_centroids] k - order by - vector_distance(@m, k.[centroid], @v) -) -select top(@k) - v.id -from - [$vector].[vector_768$vector$clusters] c -inner join - cteProbes k on k.cluster_id = c.cluster_id -inner join - [benchmark].[vector_768] v on v.id = c.item_id -order by - vector_distance(@m, v.[vector], @v) - -GO From a92de0c92f487955c49849ae1b4bbe670cfd7ba9 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 10 Jan 2025 02:21:04 +0000 Subject: [PATCH 41/64] updated notes --- notes.txt => MSSQL-Notes.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) rename notes.txt => MSSQL-Notes.txt (98%) diff --git a/notes.txt b/MSSQL-Notes.txt similarity index 98% rename from notes.txt rename to MSSQL-Notes.txt index 0ea75f0b6..762ce5219 100644 --- a/notes.txt +++ b/MSSQL-Notes.txt @@ -2,7 +2,6 @@ create login benchmark with password = 'BOR23bSu0ZsDc9zR8wdHn2w0P5yoEbBD'; create user benchmark for login benchmark; alter role db_owner add member benchmark; - --- to connect to SQL Server from WSL2 get the hostname (cannot use "localhost") @@ -14,10 +13,9 @@ echo $(hostname).local LAPTOP-DM-2.local vectordb - export LOG_LEVEL="DEBUG" +export LOG_LEVEL="DEBUG" -- - install odbc if in linux https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server From 899ec321e05bff07ea57ecdbe25dc723a17bbf26 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 10 Jan 2025 02:21:11 +0000 Subject: [PATCH 42/64] fixed search queries --- vectordb_bench/backend/clients/mssql/mssql.py | 45 ++++++++++++++----- 1 file changed, 34 insertions(+), 11 
deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 4723a9b27..26b7336a3 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -163,21 +163,44 @@ def search_embedding( #log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') cursor = self.cursor if filters: - cursor.execute(f""" - select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(?, cast(? as varchar({self.dim})), v.[vector]) + # select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(?, cast(? as varchar({self.dim})), v.[vector]) + cursor.execute(f""" + select + t.id + from + vector_search( + table = [{self.schema_name}].[{self.table_name}] AS t, + column = [vector], + similar_to = ?, + metric = '{metric_function}', + top_n = ? + ) AS s + where + v.id >= ? """, - k, - int(filters.get('id')), - metric_function, - json.dumps(query) + json.dumps(query), + k, + int(filters.get('id')), ) else: - cursor.execute(f""" - select top(?) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(?, cast(? as vector({self.dim})), v.[vector]) + # select top(?) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(?, cast(? as vector({self.dim})), v.[vector]) + cursor.execute(f""" + declare @v vector({self.dim}) = ?; + select + t.id + from + vector_search( + table = [{self.schema_name}].[{self.table_name}] AS t, + column = [vector], + similar_to = @v, + metric = '{metric_function}', + top_n = ? 
+ ) AS s + order by + t.id """, - k, - metric_function, - json.dumps(query) + json.dumps(query), + k, ) rows = cursor.fetchall() res = [row.id for row in rows] From eafc5e0ba65ed2c1b8f8904fe7cc5378dd730cb4 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 10 Jan 2025 02:38:03 +0000 Subject: [PATCH 43/64] updated notes --- MSSQL-Notes.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/MSSQL-Notes.txt b/MSSQL-Notes.txt index 762ce5219..b73b15025 100644 --- a/MSSQL-Notes.txt +++ b/MSSQL-Notes.txt @@ -43,6 +43,7 @@ python -m vectordb_bench export DROP_OLD=True|False export export NUM_PER_BATCH=50000 (default is 5000) +export NUM_CONCURRENCY=5,10,25,50 check windows firewall if cannot connect to sql From 0cca1d3bdcd584ba9196cc758c1474b9ef0fee40 Mon Sep 17 00:00:00 2001 From: Davide Mauri Date: Fri, 10 Jan 2025 20:38:20 +0000 Subject: [PATCH 44/64] removed hard-coded values --- vectordb_bench/backend/clients/mssql/mssql.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 26b7336a3..06a6f0715 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -107,8 +107,10 @@ def ready_to_load(self): log.info(f"MSSQL ready to load") pass - def optimize(self): + def optimize(self): log.info(f"MSSQL optimize") + search_param = self.case_config.search_param() + metric_function = search_param["metric"] cursor = self.cursor if self.drop_old: cursor.execute(f""" @@ -120,7 +122,7 @@ def optimize(self): ) cursor.execute(f""" - create vector index vec_idx on [benchmark].[vector_1536]([vector]) with (metric = 'cosine', type = 'DiskANN'); + create vector index vec_idx on [{self.schema_name}].[{self.table_name}]([vector]) with (metric = '{metric_function}', type = 'DiskANN'); """ ) From a7bf759444f24f1c023283b9410870a3fb64b1cc Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Wed, 15 Jan 2025 15:26:53 -0800 
Subject: [PATCH 45/64] :Add CLI Support --- vectordb_bench/backend/clients/mssql/cli.py | 55 +++++++++++++++++++ vectordb_bench/backend/clients/mssql/mssql.py | 13 +++-- vectordb_bench/cli/vectordbbench.py | 23 -------- 3 files changed, 62 insertions(+), 29 deletions(-) create mode 100644 vectordb_bench/backend/clients/mssql/cli.py diff --git a/vectordb_bench/backend/clients/mssql/cli.py b/vectordb_bench/backend/clients/mssql/cli.py new file mode 100644 index 000000000..3ad421789 --- /dev/null +++ b/vectordb_bench/backend/clients/mssql/cli.py @@ -0,0 +1,55 @@ +from typing import Annotated, Unpack + +import click +from pydantic import SecretStr + +from ....cli.cli import ( + CommonTypedDict, + cli, + click_parameter_decorators_from_typed_dict, + run, +) +from .. import DB + + +class MSSQLTypedDict(CommonTypedDict): + server: Annotated[ + str, click.option("--server", type=str, help="server url", required=True) + ] + database: Annotated[ + str, + click.option("--database", type=str, help="database name", required=True), + ] + uid: Annotated[ + str, + click.option("--uid", type=str, help="User id", required=True), + ] + pwd: Annotated[ + str, + click.option("--pwd", type=str, help="user password", required=True), + ] + metric: Annotated[ + str, + click.option("--metric", type=str, help="distance metric", required=True), + ] + + +@cli.command() +@click_parameter_decorators_from_typed_dict(MSSQLTypedDict) +def MSSQL(**parameters: Unpack[MSSQLTypedDict]): + from .config import MSSQLConfig, MSSQLVectorIndexConfig + + run( + db=DB.MSSQL, + db_config=MSSQLConfig( + server=parameters["server"], + database=parameters["database"], + uid=parameters["uid"], + pwd=parameters["pwd"], + metric=parameters["metric"] + ), + db_case_config=MSSQLVectorIndexConfig( + + ), + **parameters, + ) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 06a6f0715..abb4d8d2c 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ 
b/vectordb_bench/backend/clients/mssql/mssql.py @@ -32,7 +32,7 @@ def __init__( log.info(f"Connecting to MSSQL...") #log.info(self.db_config['connection_string']) - cnxn = pyodbc.connect(self.db_config['connection_string']) + cnxn = pyodbc.connect(self.db_config['connection_string'] + ';LongAsMax=yes;') cursor = cnxn.cursor() log.info(f"Creating schema...") @@ -81,7 +81,7 @@ def __init__( as begin set nocount on - insert into [{self.schema_name}].[{self.table_name}] (id, vector) select id, [vector] from @payload; + insert into [{self.schema_name}].[{self.table_name}] (id, [vector]) select id, [vector] from @payload; end """) cnxn.commit() @@ -91,7 +91,7 @@ def __init__( @contextmanager def init(self) -> Generator[None, None, None]: - cnxn = pyodbc.connect(self.db_config['connection_string']) + cnxn = pyodbc.connect(self.db_config['connection_string'] + ';LongAsMax=yes;') self.cnxn = cnxn cnxn.autocommit = True self.cursor = cnxn.cursor() @@ -151,7 +151,8 @@ def insert_embeddings( #cursor.rollback() log.warning(f"Failed to insert data into vector table ([{self.schema_name}].[{self.table_name}]), error: {e}") return 0, e - + + @staticmethod def search_embedding( self, query: list[float], @@ -160,7 +161,7 @@ def search_embedding( timeout: int | None = None, ) -> list[int]: search_param = self.case_config.search_param() - metric_function = search_param["metric"] + metric_function = 'euclidean' #search_param["metric"] #efSearch = search_param["efSearch"] #log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') cursor = self.cursor @@ -208,4 +209,4 @@ def search_embedding( res = [row.id for row in rows] return res - \ No newline at end of file + diff --git a/vectordb_bench/cli/vectordbbench.py b/vectordb_bench/cli/vectordbbench.py index 5e3798691..68fd7b08e 100644 --- a/vectordb_bench/cli/vectordbbench.py +++ b/vectordb_bench/cli/vectordbbench.py @@ -1,30 +1,7 @@ -from ..backend.clients.alloydb.cli import AlloyDBScaNN 
-from ..backend.clients.aws_opensearch.cli import AWSOpenSearch -from ..backend.clients.memorydb.cli import MemoryDB -from ..backend.clients.milvus.cli import MilvusAutoIndex -from ..backend.clients.pgdiskann.cli import PgDiskAnn -from ..backend.clients.pgvecto_rs.cli import PgVectoRSHNSW, PgVectoRSIVFFlat from ..backend.clients.pgvector.cli import PgVectorHNSW -from ..backend.clients.pgvectorscale.cli import PgVectorScaleDiskAnn -from ..backend.clients.redis.cli import Redis -from ..backend.clients.test.cli import Test -from ..backend.clients.weaviate_cloud.cli import Weaviate -from ..backend.clients.zilliz_cloud.cli import ZillizAutoIndex from .cli import cli cli.add_command(PgVectorHNSW) -cli.add_command(PgVectoRSHNSW) -cli.add_command(PgVectoRSIVFFlat) -cli.add_command(Redis) -cli.add_command(MemoryDB) -cli.add_command(Weaviate) -cli.add_command(Test) -cli.add_command(ZillizAutoIndex) -cli.add_command(MilvusAutoIndex) -cli.add_command(AWSOpenSearch) -cli.add_command(PgVectorScaleDiskAnn) -cli.add_command(PgDiskAnn) -cli.add_command(AlloyDBScaNN) if __name__ == "__main__": From 8253c5e0b1d3e4a0c88ecfbd25624f3bc5646612 Mon Sep 17 00:00:00 2001 From: "Josh Innis (Warner Marketing Incorporated)" Date: Wed, 22 Jan 2025 11:40:05 -0800 Subject: [PATCH 46/64] Remove static declaration of search_embedding from MSSQL --- vectordb_bench/backend/clients/mssql/mssql.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index abb4d8d2c..431bb1826 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -152,7 +152,6 @@ def insert_embeddings( log.warning(f"Failed to insert data into vector table ([{self.schema_name}].[{self.table_name}]), error: {e}") return 0, e - @staticmethod def search_embedding( self, query: list[float], From 10b363c6c77b5e6aa6af62f22a39384d6bdaf5bc Mon Sep 17 00:00:00 2001 From: "Josh Innis (Warner 
Marketing Incorporated)" Date: Wed, 22 Jan 2025 11:48:23 -0800 Subject: [PATCH 47/64] Add the CLI Configurations --- vectordb_bench/backend/clients/mssql/cli.py | 6 +----- vectordb_bench/cli/vectordbbench.py | 23 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/cli.py b/vectordb_bench/backend/clients/mssql/cli.py index 3ad421789..9d60e270d 100644 --- a/vectordb_bench/backend/clients/mssql/cli.py +++ b/vectordb_bench/backend/clients/mssql/cli.py @@ -28,10 +28,7 @@ class MSSQLTypedDict(CommonTypedDict): str, click.option("--pwd", type=str, help="user password", required=True), ] - metric: Annotated[ - str, - click.option("--metric", type=str, help="distance metric", required=True), - ] + @cli.command() @@ -46,7 +43,6 @@ def MSSQL(**parameters: Unpack[MSSQLTypedDict]): database=parameters["database"], uid=parameters["uid"], pwd=parameters["pwd"], - metric=parameters["metric"] ), db_case_config=MSSQLVectorIndexConfig( diff --git a/vectordb_bench/cli/vectordbbench.py b/vectordb_bench/cli/vectordbbench.py index 68fd7b08e..cadbaa04b 100644 --- a/vectordb_bench/cli/vectordbbench.py +++ b/vectordb_bench/cli/vectordbbench.py @@ -1,7 +1,30 @@ +from ..backend.clients.alloydb.cli import AlloyDBScaNN +from ..backend.clients.aws_opensearch.cli import AWSOpenSearch +from ..backend.clients.milvus.cli import MilvusAutoIndex +from ..backend.clients.pgdiskann.cli import PgDiskAnn +from ..backend.clients.pgvecto_rs.cli import PgVectoRSHNSW, PgVectoRSIVFFlat from ..backend.clients.pgvector.cli import PgVectorHNSW +from ..backend.clients.pgvectorscale.cli import PgVectorScaleDiskAnn +from ..backend.clients.redis.cli import Redis +from ..backend.clients.test.cli import Test +from ..backend.clients.weaviate_cloud.cli import Weaviate +from ..backend.clients.zilliz_cloud.cli import ZillizAutoIndex +from ..backend.clients.mssql.cli import MSSQL from .cli import cli cli.add_command(PgVectorHNSW) 
+cli.add_command(PgVectoRSHNSW) +cli.add_command(PgVectoRSIVFFlat) +cli.add_command(Redis) +cli.add_command(Weaviate) +cli.add_command(Test) +cli.add_command(ZillizAutoIndex) +cli.add_command(MilvusAutoIndex) +cli.add_command(AWSOpenSearch) +cli.add_command(PgVectorScaleDiskAnn) +cli.add_command(PgDiskAnn) +cli.add_command(AlloyDBScaNN) +cli.add_command(MSSQL) if __name__ == "__main__": From 5a294232bf66d8f263511ec223726a9e531d678e Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Tue, 4 Feb 2025 16:45:18 -0800 Subject: [PATCH 48/64] Change CREATE VECTOR INDEX to build with Euclidean Distance --- vectordb_bench/backend/clients/mssql/mssql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 431bb1826..d81aaafb9 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -122,7 +122,7 @@ def optimize(self): ) cursor.execute(f""" - create vector index vec_idx on [{self.schema_name}].[{self.table_name}]([vector]) with (metric = '{metric_function}', type = 'DiskANN'); + create vector index vec_idx on [{self.schema_name}].[{self.table_name}]([vector]) with (metric = 'euclidean', type = 'DiskANN'); """ ) From 5393168beb8c0ea13ead343aa3272538e4b88830 Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Tue, 18 Feb 2025 11:52:39 -0800 Subject: [PATCH 49/64] Drop the vector table type and Stored Procedure If attempting to load 2 datasets with different vector dimensions to the same database, a schema error will occur because the vector type is set to a specific dimension. 
--- vectordb_bench/backend/clients/mssql/mssql.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index d81aaafb9..46b388d27 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -50,6 +50,7 @@ def __init__( """) cnxn.commit() + log.info(f"Creating vector table...") cursor.execute(f""" if object_id('[{self.schema_name}].[{self.table_name}]') is null begin @@ -60,7 +61,14 @@ def __init__( end """) cnxn.commit() - + + log.info(f"Dropping old loading vector table type and stored procedure") + cursor.execute(f""" + drop procedure if exists stp_load_vectors + drop type if exists dbo.vector_payload + """) + cnxn.commit() + log.info(f"Creating table type...") cursor.execute(f""" if type_id('dbo.vector_payload') is null begin @@ -160,9 +168,8 @@ def search_embedding( timeout: int | None = None, ) -> list[int]: search_param = self.case_config.search_param() - metric_function = 'euclidean' #search_param["metric"] + metric_function = search_param["metric"] #efSearch = search_param["efSearch"] - #log.info(f'Query top:{k} metric:{metric_fun} filters:{filters} params: {search_param} timeout:{timeout}...') cursor = self.cursor if filters: # select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(?, cast(? as varchar({self.dim})), v.[vector]) From 592bde3e2280a988abf9b96c431e60376a826c7b Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Tue, 18 Feb 2025 12:21:23 -0800 Subject: [PATCH 50/64] Remove hard coded metric type Using Euclidean distance for all datasets was a relic from testing a development build. 
--- vectordb_bench/backend/clients/mssql/mssql.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 46b388d27..a672aa871 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -32,7 +32,7 @@ def __init__( log.info(f"Connecting to MSSQL...") #log.info(self.db_config['connection_string']) - cnxn = pyodbc.connect(self.db_config['connection_string'] + ';LongAsMax=yes;') + cnxn = pyodbc.connect(self.db_config['connection_string']) cursor = cnxn.cursor() log.info(f"Creating schema...") @@ -99,7 +99,7 @@ def __init__( @contextmanager def init(self) -> Generator[None, None, None]: - cnxn = pyodbc.connect(self.db_config['connection_string'] + ';LongAsMax=yes;') + cnxn = pyodbc.connect(self.db_config['connection_string']) self.cnxn = cnxn cnxn.autocommit = True self.cursor = cnxn.cursor() @@ -130,7 +130,7 @@ def optimize(self): ) cursor.execute(f""" - create vector index vec_idx on [{self.schema_name}].[{self.table_name}]([vector]) with (metric = 'euclidean', type = 'DiskANN'); + create vector index vec_idx on [{self.schema_name}].[{self.table_name}]([vector]) with (metric = '{metric_function}', type = 'DiskANN'); """ ) From 52527eb464eef443bd3aa4243e36bea2d0596525 Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Tue, 15 Apr 2025 09:26:15 -0700 Subject: [PATCH 51/64] Update MSSQL-Setup.md --- MSSQL-Setup.md | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/MSSQL-Setup.md b/MSSQL-Setup.md index bfb1f397f..fb80ec595 100644 --- a/MSSQL-Setup.md +++ b/MSSQL-Setup.md @@ -1,4 +1,4 @@ -# Run VectorDBBench agains MSSQL database +# Run VectorDBBench against MSSQL database VectorDBBench has been tested running on WSL2 + Ubuntu 22.04.4 LTS. 
@@ -23,20 +23,6 @@ curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 Clone the repository into a local folder -## Create Virtual Environment - -In local folder where you have cloned the repository, create a virtual environment: - -``` -python3.11 -m venv .venv -``` - -then activate it: - -``` -. ./.venv/bin/activate -``` - ## Install VectorDBBench dependencies Install the VectorDBBench dependencies From 8a317457b99a1185d65efd586f462d061c93ef84 Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Fri, 18 Apr 2025 14:17:36 -0700 Subject: [PATCH 52/64] Update MSSQL-Setup.md --- MSSQL-Setup.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/MSSQL-Setup.md b/MSSQL-Setup.md index fb80ec595..70ca287a5 100644 --- a/MSSQL-Setup.md +++ b/MSSQL-Setup.md @@ -21,6 +21,10 @@ curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 ## Clone the repository +``` +git clone https://github.com/MSSQL-VectorDBBench/VectorDBBench +``` + Clone the repository into a local folder ## Install VectorDBBench dependencies @@ -32,8 +36,20 @@ pip install -e '.[test]' pip install -e '.[mssql]' ``` + +## Run VectorDBBench with help + +``` +vectordbbench mssql --help +``` + ## Run VectorDBBench +``` +vectordbbench mssql --database=vectordb --server=10.177.3.78 --uid=sa --pwd=--concurrency-duration=1800 --skip-search-concurrent --case-type=Performance1536D500K --skip-load --skip-drop-old + +``` +## Start the Server ``` python -m vectordb_bench ``` From e7e3cdc23acc05bf16c1dfa02a184199b6de0e63 Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Fri, 18 Apr 2025 14:25:30 -0700 Subject: [PATCH 53/64] Rename MSSQL-Setup.md to README-MSSQLmd --- MSSQL-Setup.md => README-MSSQLmd | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename MSSQL-Setup.md => README-MSSQLmd (100%) diff --git a/MSSQL-Setup.md b/README-MSSQLmd similarity index 100% rename from MSSQL-Setup.md rename to README-MSSQLmd From edcda67655e146fea583a0016ade9119797275c6 Mon Sep 17 00:00:00 2001 From: Josh Innis Date: 
Fri, 18 Apr 2025 14:26:06 -0700 Subject: [PATCH 54/64] Delete MSSQL-Notes.txt --- MSSQL-Notes.txt | 62 ------------------------------------------------- 1 file changed, 62 deletions(-) delete mode 100644 MSSQL-Notes.txt diff --git a/MSSQL-Notes.txt b/MSSQL-Notes.txt deleted file mode 100644 index b73b15025..000000000 --- a/MSSQL-Notes.txt +++ /dev/null @@ -1,62 +0,0 @@ -create login benchmark with password = 'BOR23bSu0ZsDc9zR8wdHn2w0P5yoEbBD'; -create user benchmark for login benchmark; -alter role db_owner add member benchmark; - ---- - -to connect to SQL Server from WSL2 get the hostname (cannot use "localhost") - -echo $(hostname).local - ---- - -LAPTOP-DM-2.local -vectordb - -export LOG_LEVEL="DEBUG" --- - -install odbc if in linux - -https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server - -install python 3.11 - -https://ubuntuhandbook.org/index.php/2022/10/python-3-11-released-how-install-ubuntu/ - -instal python3.11 pip - -apt install python3.11 python3.11-distutils python3.11-venv -curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 - -create a folder - -python3.11 -m venv .venv - -. ./.venv/bin/activate - -pip install -e '.[test]' - -pip install -e '.[mssql]' - -python -m vectordb_bench - -export DROP_OLD=True|False -export export NUM_PER_BATCH=50000 (default is 5000) -export NUM_CONCURRENCY=5,10,25,50 - -check windows firewall if cannot connect to sql - - -select used_page_count * 8. / 1024. / 1024. 
as size_in_gb, index_id, row_count, used_page_count from sys.dm_db_partition_stats -where object_id = object_id('[benchmark].[vector_768]') - ----- - -declare @v varbinary(8000) -select @v = vector from benchmark.vector_768 where id = 1224 - -select id from [$vector].[find_similar$vector_768$vector](@v, 100, 30, 1, 'cosine') ---exec dbo.stp_kmeans_search @p=30, @k=10, @v=@v - ---exec dbo.stp_kmeans_search @p=1000, @k=10, @v=@v From 55d6f3684c96ac34d7c4ed9c25b79412e2ccfde5 Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Fri, 18 Apr 2025 14:38:00 -0700 Subject: [PATCH 55/64] Update README-MSSQLmd --- README-MSSQLmd | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/README-MSSQLmd b/README-MSSQLmd index 70ca287a5..23a17496f 100644 --- a/README-MSSQLmd +++ b/README-MSSQLmd @@ -37,19 +37,28 @@ pip install -e '.[mssql]' ``` -## Run VectorDBBench with help +## Run VectorDBBench on the Command Line Interface with help ``` vectordbbench mssql --help ``` -## Run VectorDBBench +## Run VectorDBBench on the Command Line Interface +The database must exist and there must be enough room to build the index + +``` +vectordbbench mssql --database=vectordb --server=**IP_ADDRESS** --uid=sa --pwd=**PASSWORD_HERE** --concurrency-duration=1800 --skip-search-concurrent --case-type=Performance1536D500K + +``` + +## Run VectorDBBench on the Command Line Interface with Existing Data + ``` -vectordbbench mssql --database=vectordb --server=10.177.3.78 --uid=sa --pwd=--concurrency-duration=1800 --skip-search-concurrent --case-type=Performance1536D500K --skip-load --skip-drop-old +vectordbbench mssql --database=vectordb --server=**IP_ADDRESS** --uid=sa --pwd=**PASSWORD_HERE** --concurrency-duration=1800 --skip-search-concurrent --case-type=Performance1536D500K ``` -## Start the Server +## Start VectorDBBench in the GUI Mode ``` python -m vectordb_bench ``` From 92c8902564b0b85c400001c83cbc17a38d2e40f5 Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Fri, 18 Apr 
2025 14:38:46 -0700 Subject: [PATCH 56/64] Update README-MSSQLmd --- README-MSSQLmd | 2 -- 1 file changed, 2 deletions(-) diff --git a/README-MSSQLmd b/README-MSSQLmd index 23a17496f..0a3a4744a 100644 --- a/README-MSSQLmd +++ b/README-MSSQLmd @@ -48,14 +48,12 @@ The database must exist and there must be enough room to build the index ``` vectordbbench mssql --database=vectordb --server=**IP_ADDRESS** --uid=sa --pwd=**PASSWORD_HERE** --concurrency-duration=1800 --skip-search-concurrent --case-type=Performance1536D500K - ``` ## Run VectorDBBench on the Command Line Interface with Existing Data ``` vectordbbench mssql --database=vectordb --server=**IP_ADDRESS** --uid=sa --pwd=**PASSWORD_HERE** --concurrency-duration=1800 --skip-search-concurrent --case-type=Performance1536D500K - ``` ## Start VectorDBBench in the GUI Mode From 168df35c7de7351e731c565d60eea07e9c4726ca Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Mon, 21 Apr 2025 12:11:49 -0700 Subject: [PATCH 57/64] Update README-MSSQLmd --- README-MSSQLmd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README-MSSQLmd b/README-MSSQLmd index 0a3a4744a..f9b0ba13f 100644 --- a/README-MSSQLmd +++ b/README-MSSQLmd @@ -29,11 +29,12 @@ Clone the repository into a local folder ## Install VectorDBBench dependencies -Install the VectorDBBench dependencies +Change directoies into VectorDBBench and Install the VectorDBBench and its dependencies ``` -pip install -e '.[test]' -pip install -e '.[mssql]' +cd VectorDBBench +pip install pyodbc +pip install . ``` From e4ee845fea91d0fb78ef7484108451290f6c240b Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Tue, 22 Apr 2025 11:00:31 -0700 Subject: [PATCH 58/64] Fix formatting bug in logging when loading table The logging of Vector Insert time had a bug where python would throw a warning due to a formatting issue. 
--- vectordb_bench/backend/runner/mp_runner.py | 6 ++---- vectordb_bench/backend/runner/serial_runner.py | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vectordb_bench/backend/runner/mp_runner.py b/vectordb_bench/backend/runner/mp_runner.py index 5b69b5481..81c7ef01a 100644 --- a/vectordb_bench/backend/runner/mp_runner.py +++ b/vectordb_bench/backend/runner/mp_runner.py @@ -79,14 +79,12 @@ def search( if count % 500 == 0: log.debug( - f"({mp.current_process().name:16}) ", - f"search_count: {count}, latest_latency={time.perf_counter()-s}", + f"({mp.current_process().name:16}) search_count: {count}, latest_latency={time.perf_counter()-s}", ) total_dur = round(time.perf_counter() - start_time, 4) log.info( - f"{mp.current_process().name:16} search {self.duration}s: " - f"actual_dur={total_dur}s, count={count}, qps in this process: {round(count / total_dur, 4):3}", + f"{mp.current_process().name:16} search {self.duration}s: actual_dur={total_dur}s, count={count}, qps in this process: {round(count / total_dur, 4):3}", ) return (count, total_dur, latencies) diff --git a/vectordb_bench/backend/runner/serial_runner.py b/vectordb_bench/backend/runner/serial_runner.py index 7eb59432b..760036f0f 100644 --- a/vectordb_bench/backend/runner/serial_runner.py +++ b/vectordb_bench/backend/runner/serial_runner.py @@ -71,8 +71,7 @@ def task(self) -> int: ) log.info( - f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, ", - f"dur={time.perf_counter()-start}", + f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, dur={time.perf_counter()-start}", ) return count From 013dd5d467687f30b169f7400917cbd4b7a8ac3c Mon Sep 17 00:00:00 2001 From: Josh Innis Date: Tue, 22 Apr 2025 14:22:29 -0700 Subject: [PATCH 59/64] Fix Environs Package Since this project was developed, the API for the environs package has changed. Add a rule to pyproject.toml to use older versions of the package to not break the API. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a910a800e..92fc26d5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ "psutil", "polars", "plotly", - "environs", + "environs<14.0.1", "pydantic Date: Tue, 22 Apr 2025 15:07:17 -0700 Subject: [PATCH 60/64] Report p50 and p95 latencies in Serial Search Report the latencies for p50 and p95 and not just p99 in the serial search phase. --- vectordb_bench/backend/clients/mssql/mssql.py | 2 +- vectordb_bench/backend/runner/serial_runner.py | 8 ++++++-- vectordb_bench/backend/task_runner.py | 2 +- vectordb_bench/metric.py | 2 ++ vectordb_bench/models.py | 8 +++++++- 5 files changed, 17 insertions(+), 5 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index a672aa871..79bd7f305 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -44,7 +44,7 @@ def __init__( cnxn.commit() if drop_old: - log.info(f"Dropping existing table...") + log.info(f"Dropping existing table... 
drop table if exists [{self.schema_name}].[{self.table_name}] ") cursor.execute(f""" drop table if exists [{self.schema_name}].[{self.table_name}] """) diff --git a/vectordb_bench/backend/runner/serial_runner.py b/vectordb_bench/backend/runner/serial_runner.py index 760036f0f..63c17e7a8 100644 --- a/vectordb_bench/backend/runner/serial_runner.py +++ b/vectordb_bench/backend/runner/serial_runner.py @@ -250,6 +250,9 @@ def search(self, args: tuple[list, pd.DataFrame]) -> tuple[float, float, float]: avg_ndcg = round(np.mean(ndcgs), 4) cost = round(np.sum(latencies), 4) p99 = round(np.percentile(latencies, 99), 4) + p95 = round(np.percentile(latencies, 95), 4) + p50 = round(np.percentile(latencies, 50), 4) + log.info( f"{mp.current_process().name:14} search entire test_data: " f"cost={cost}s, " @@ -257,9 +260,10 @@ def search(self, args: tuple[list, pd.DataFrame]) -> tuple[float, float, float]: f"avg_recall={avg_recall}, " f"avg_ndcg={avg_ndcg}," f"avg_latency={avg_latency}, " - f"p99={p99}", + f"p99={p99}, p95={p95}, p50={p50}", ) - return (avg_recall, avg_ndcg, p99) + log.info(p95) + return (avg_recall, avg_ndcg, p99, p95, p50) def _run_in_subprocess(self) -> tuple[float, float]: with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor: diff --git a/vectordb_bench/backend/task_runner.py b/vectordb_bench/backend/task_runner.py index e24d74f03..4af2169b9 100644 --- a/vectordb_bench/backend/task_runner.py +++ b/vectordb_bench/backend/task_runner.py @@ -196,7 +196,7 @@ def _run_perf_case(self, drop_old: bool = True) -> Metric: m.recall = search_results.recall m.serial_latencies = search_results.serial_latencies """ - m.recall, m.ndcg, m.serial_latency_p99 = search_results + m.recall, m.ndcg, m.serial_latency_p99, m.serial_latency_p95, m.serial_latency_p50 = search_results except Exception as e: log.warning(f"Failed to run performance case, reason = {e}") diff --git a/vectordb_bench/metric.py b/vectordb_bench/metric.py index e0b6cff0e..ae994925f 100644 --- 
a/vectordb_bench/metric.py +++ b/vectordb_bench/metric.py @@ -17,6 +17,8 @@ class Metric: load_duration: float = 0.0 # duration to load all dataset into DB qps: float = 0.0 serial_latency_p99: float = 0.0 + serial_latency_p95: float = 0.0 + serial_latency_p50: float = 0.0 recall: float = 0.0 ndcg: float = 0.0 conc_num_list: list[int] = field(default_factory=list) diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py index 49bb04ae0..7a0ef76d7 100644 --- a/vectordb_bench/models.py +++ b/vectordb_bench/models.py @@ -295,6 +295,8 @@ def append_return(x: any, y: any): max_load_dur, max_qps, 15, + 15, + 15, max_recall, 14, 5, @@ -302,7 +304,7 @@ def append_return(x: any, y: any): DATA_FORMAT = ( f"%-{max_db}s | %-{max_db_labels}s %-{max_case}s %-{len(self.task_label)}s" - f" | %-{max_load_dur}s %-{max_qps}s %-15s %-{max_recall}s %-14s" + f" | %-{max_load_dur}s %-{max_qps}s %-15s %-15s %-15s %-{max_recall}s %-14s" f" | %-5s" ) @@ -314,6 +316,8 @@ def append_return(x: any, y: any): "load_dur", "qps", "latency(p99)", + "latency(p95)", + "latency(p50)", "recall", "max_load_count", "label", @@ -336,6 +340,8 @@ def append_return(x: any, y: any): f.metrics.load_duration, f.metrics.qps, f.metrics.serial_latency_p99, + f.metrics.serial_latency_p95, + f.metrics.serial_latency_p50, f.metrics.recall, f.metrics.max_load_count, f.label.value, From b19d0a42f2c6929d1af06d54584dff9b1ecb4c7b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 27 Aug 2025 12:13:42 +0000 Subject: [PATCH 61/64] Support EntraId Add the ability to use EntraId authentication to login to SQL Server --- pyproject.toml | 2 +- vectordb_bench/backend/clients/mssql/cli.py | 9 ++- .../backend/clients/mssql/config.py | 74 +++++++++++++++++-- vectordb_bench/backend/clients/mssql/mssql.py | 10 ++- 4 files changed, 84 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 92fc26d5d..3b0d89914 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,6 @@ authors = [ 
{name="XuanYang-cn", email="xuan.yang@zilliz.com"}, ] description = "VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze." - readme = "README.md" requires-python = ">=3.11" classifiers = [ @@ -23,6 +22,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] + dependencies = [ "click", "pytz", diff --git a/vectordb_bench/backend/clients/mssql/cli.py b/vectordb_bench/backend/clients/mssql/cli.py index 9d60e270d..e72014e8c 100644 --- a/vectordb_bench/backend/clients/mssql/cli.py +++ b/vectordb_bench/backend/clients/mssql/cli.py @@ -22,11 +22,15 @@ class MSSQLTypedDict(CommonTypedDict): ] uid: Annotated[ str, - click.option("--uid", type=str, help="User id", required=True), + click.option("--uid", type=str, help="User id", required=False), ] pwd: Annotated[ str, - click.option("--pwd", type=str, help="user password", required=True), + click.option("--pwd", type=str, help="user password", required=False), + ] + entraid: Annotated[ + str, + click.option("--entraid", type=str, help="Entra Id Authentication", required=False), ] @@ -43,6 +47,7 @@ def MSSQL(**parameters: Unpack[MSSQLTypedDict]): database=parameters["database"], uid=parameters["uid"], pwd=parameters["pwd"], + entraid=parameters["entraid"] ), db_case_config=MSSQLVectorIndexConfig( diff --git a/vectordb_bench/backend/clients/mssql/config.py b/vectordb_bench/backend/clients/mssql/config.py index 53990dfa5..0f892973e 100644 --- a/vectordb_bench/backend/clients/mssql/config.py +++ b/vectordb_bench/backend/clients/mssql/config.py @@ -1,18 +1,80 @@ +import pyodbc +import struct 
+from azure.identity import ManagedIdentityCredential from pydantic import BaseModel, SecretStr +from typing import Optional from ..api import DBConfig, DBCaseConfig, MetricType - MSSQL_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;UID=%s;PWD=%s;LongAsMax=yes;Connect Timeout=30;TrustServerCertificate=Yes" +MSSQL_ENTRA_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;AUTHENTICATION=ActiveDirectoryMsi;UID=%s;LongAsMax=yes;Connect Timeout=30;Encrypt=yes;TrustServerCertificate=Yes" + +#MSSQL_ENTRA_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;AUTHENTICATION=ActiveDirectoryServicePrincipal;UID=%s;PWD=%s;LongAsMax=yes;Connect Timeout=30;Encrypt=yes;TrustServerCertificate=No" + +# --- Constants for Token Authentication --- +SQL_COPT_SS_ACCESS_TOKEN = 1256 +SQL_SERVER_TOKEN_SCOPE = "https://database.windows.net/.default" + +# --- Your Modified MSSQLConfig Class --- + class MSSQLConfig(DBConfig): server: str database: str - uid: str - pwd: SecretStr + uid: Optional[str] = None + pwd: Optional[SecretStr] = None + entraid: Optional[str] = None def to_dict(self) -> dict: - pwd_str = self.pwd.get_secret_value() + """ + Prepares connection parameters. If entraid is provided, it fetches a token + manually and returns connection attributes for pyodbc. 
+ """ + # --- Case 1: Standard SQL Authentication --- + if self.entraid is None: + if not self.uid or not self.pwd: + raise ValueError("UID and PWD must be provided for standard SQL auth.") + + pwd_str = self.pwd.get_secret_value() + connection_string = ( + f"DRIVER={{ODBC Driver 18 for SQL Server}};" + f"SERVER={self.server};" + f"DATABASE={self.database};" + f"UID={self.uid};" + f"PWD={pwd_str};" + "LongAsMax=yes;" + "Connect Timeout=30;" + "Encrypt=yes;" + "TrustServerCertificate=Yes" + ) + return {"connection_string": connection_string} + + # --- Case 2: Entra ID Managed Identity (Manual Token Auth) --- + print(f"Attempting to get token for User-Assigned Identity: {self.entraid}") + + # 1. Get credentials and token using azure-identity + credential = ManagedIdentityCredential(client_id=self.entraid) + access_token = credential.get_token(SQL_SERVER_TOKEN_SCOPE) + token_bytes = access_token.token.encode("UTF-16-LE") + + # 2. Pack the token for the driver + token_struct = struct.pack(f' dict: return { "efSearch" : self.efSearch, "metric" : self.parse_metric() - } \ No newline at end of file + } diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py index 79bd7f305..53dee7f29 100644 --- a/vectordb_bench/backend/clients/mssql/mssql.py +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -32,7 +32,10 @@ def __init__( log.info(f"Connecting to MSSQL...") #log.info(self.db_config['connection_string']) - cnxn = pyodbc.connect(self.db_config['connection_string']) + cnxn = pyodbc.connect( + self.db_config.get("connection_string"), + attrs_before=self.db_config.get("attrs_before") + ) cursor = cnxn.cursor() log.info(f"Creating schema...") @@ -99,7 +102,10 @@ def __init__( @contextmanager def init(self) -> Generator[None, None, None]: - cnxn = pyodbc.connect(self.db_config['connection_string']) + cnxn = pyodbc.connect( + self.db_config.get("connection_string"), + attrs_before=self.db_config.get("attrs_before") + ) 
 self.cnxn = cnxn cnxn.autocommit = True self.cursor = cnxn.cursor() From f93379f0869f502a8a453d8d420bdf88dea24877 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 2 Sep 2025 15:49:34 +0000 Subject: [PATCH 62/64] Cleanup Connection Strings --- vectordb_bench/backend/clients/mssql/config.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/config.py b/vectordb_bench/backend/clients/mssql/config.py index 0f892973e..dec76da39 100644 --- a/vectordb_bench/backend/clients/mssql/config.py +++ b/vectordb_bench/backend/clients/mssql/config.py @@ -4,11 +4,8 @@ from pydantic import BaseModel, SecretStr from typing import Optional from ..api import DBConfig, DBCaseConfig, MetricType -MSSQL_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;UID=%s;PWD=%s;LongAsMax=yes;Connect Timeout=30;TrustServerCertificate=Yes" - -MSSQL_ENTRA_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;AUTHENTICATION=ActiveDirectoryMsi;UID=%s;LongAsMax=yes;Connect Timeout=30;Encrypt=yes;TrustServerCertificate=Yes" -#MSSQL_ENTRA_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;AUTHENTICATION=ActiveDirectoryServicePrincipal;UID=%s;PWD=%s;LongAsMax=yes;Connect Timeout=30;Encrypt=yes;TrustServerCertificate=No" +MSSQL_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;UID=%s;PWD=%s;LongAsMax=yes;Connect Timeout=30;TrustServerCertificate=Yes" # --- Constants for Token Authentication --- SQL_COPT_SS_ACCESS_TOKEN = 1256 SQL_SERVER_TOKEN_SCOPE = "https://database.windows.net/.default" From 6edf66896f1d4caa81d9274748cdba5eb45af340 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 2 Sep 2025 15:52:45 +0000 Subject: [PATCH 63/64] Change logging and error checking to logging library The rest of the project uses the logging library to report info and errors. Switch the changes on this branch to use the logging library for these purposes too. 
--- vectordb_bench/backend/clients/mssql/config.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vectordb_bench/backend/clients/mssql/config.py b/vectordb_bench/backend/clients/mssql/config.py index dec76da39..7e2321138 100644 --- a/vectordb_bench/backend/clients/mssql/config.py +++ b/vectordb_bench/backend/clients/mssql/config.py @@ -1,10 +1,13 @@ import pyodbc import struct +import logging from azure.identity import ManagedIdentityCredential from pydantic import BaseModel, SecretStr from typing import Optional from ..api import DBConfig, DBCaseConfig, MetricType +log = logging.getLogger(__name__) + MSSQL_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;UID=%s;PWD=%s;LongAsMax=yes;Connect Timeout=30;TrustServerCertificate=Yes" # --- Constants for Token Authentication --- @@ -28,7 +31,7 @@ def to_dict(self) -> dict: # --- Case 1: Standard SQL Authentication --- if self.entraid is None: if not self.uid or not self.pwd: - raise ValueError("UID and PWD must be provided for standard SQL auth.") + log.error("UID and PWD must be provided for standard SQL auth.") pwd_str = self.pwd.get_secret_value() connection_string = ( @@ -45,7 +48,7 @@ def to_dict(self) -> dict: return {"connection_string": connection_string} # --- Case 2: Entra ID Managed Identity (Manual Token Auth) --- - print(f"Attempting to get token for User-Assigned Identity: {self.entraid}") + log.info(f"Attempting to get token for User-Assigned Identity: {self.entraid}") # 1. Get credentials and token using azure-identity credential = ManagedIdentityCredential(client_id=self.entraid) @@ -55,7 +58,7 @@ def to_dict(self) -> dict: # 2. 
Pack the token for the driver token_struct = struct.pack(f' Date: Fri, 17 Oct 2025 19:20:57 +0000 Subject: [PATCH 64/64] Add Azure-identity to pyproject.toml --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3b0d89914..7a08114d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,8 @@ all = [ "memorydb", "alibabacloud_ha3engine_vector", "alibabacloud_searchengine20211025", - "pyodbc" + "pyodbc", + "azure-identity" ] qdrant = [ "qdrant-client" ] @@ -86,7 +87,7 @@ memorydb = [ "memorydb" ] chromadb = [ "chromadb" ] opensearch = [ "opensearch-py" ] aliyun_opensearch = [ "alibabacloud_ha3engine_vector", "alibabacloud_searchengine20211025"] -mssql = [ "pyodbc" ] +mssql = [ "pyodbc", "azure-identity" ] [project.urls] "repository" = "https://github.com/zilliztech/VectorDBBench"