diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..8efbe82d5 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,13 @@ +# Thanks to: https://rehansaeed.com/gitattributes-best-practices/ + +# Set default behavior to automatically normalize line endings. +* text=auto + +# Force batch scripts to always use CRLF line endings so that if a repo is accessed +# in Windows via a file share from Linux, the scripts will work. +*.{cmd,[cC][mM][dD]} text eol=crlf +*.{bat,[bB][aA][tT]} text eol=crlf + +# Force bash scripts to always use LF line endings so that if a repo is accessed +# in Unix via a file share from Windows, the scripts will work. +*.sh text eol=lf \ No newline at end of file diff --git a/.gitignore b/.gitignore index f8602c11a..9bb88eb68 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ __MACOSX build/ venv/ .idea/ -results/ \ No newline at end of file +results/ +.venv/ \ No newline at end of file diff --git a/README-MSSQLmd b/README-MSSQLmd new file mode 100644 index 000000000..f9b0ba13f --- /dev/null +++ b/README-MSSQLmd @@ -0,0 +1,63 @@ +# Run VectorDBBench against MSSQL database + +VectorDBBench has been tested running on WSL2 + Ubuntu 22.04.4 LTS. + +## Install ODBC + +Follow instructions here: https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server + +## Install Python 3.11 + +Follow instructions here: https://ubuntuhandbook.org/index.php/2022/10/python-3-11-released-how-install-ubuntu/) + +## Install pip for Python3.11 : + +Use the following commands: + +``` +sudo apt install python3.11 python3.11-distutils python3.11-venv +curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 +``` + +## Clone the repository + +``` +git clone https://github.com/MSSQL-VectorDBBench/VectorDBBench +``` + +Clone the repository into a local folder + +## Install VectorDBBench dependencies + +Change directoies into VectorDBBench and Install the VectorDBBench and its dependencies + +``` +cd VectorDBBench +pip install pyodbc +pip install . +``` + + +## Run VectorDBBench on the Command Line Interface with help + +``` +vectordbbench mssql --help +``` + +## Run VectorDBBench on the Command Line Interface +The database must exist and there must be enough room to build the index + +``` +vectordbbench mssql --database=vectordb --server=**IP_ADDRESS** --uid=sa --pwd=**PASSWORD_HERE** --concurrency-duration=1800 --skip-search-concurrent --case-type=Performance1536D500K +``` + +## Run VectorDBBench on the Command Line Interface with Existing Data + +``` +vectordbbench mssql --database=vectordb --server=**IP_ADDRESS** --uid=sa --pwd=**PASSWORD_HERE** --concurrency-duration=1800 --skip-search-concurrent --case-type=Performance1536D500K +``` + +## Start VectorDBBench in the GUI Mode +``` +python -m vectordb_bench +``` diff --git a/pyproject.toml b/pyproject.toml index 312940634..7a08114d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,6 @@ authors = [ {name="XuanYang-cn", email="xuan.yang@zilliz.com"}, ] description = "VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze." - readme = "README.md" requires-python = ">=3.11" classifiers = [ @@ -23,6 +22,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] + dependencies = [ "click", "pytz", @@ -35,7 +35,7 @@ dependencies = [ "psutil", "polars", "plotly", - "environs", + "environs<14.0.1", "pydantic type[VectorDB]: # noqa: PLR0911, PLR0912 """Import while in use""" + if self == DB.MSSQL: + from .mssql.mssql import MSSQL + return MSSQL + if self == DB.Milvus: from .milvus.milvus import Milvus @@ -135,6 +140,10 @@ def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912 @property def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912 """Import while in use""" + if self == DB.MSSQL: + from .mssql.config import MSSQLConfig + return MSSQLConfig + if self == DB.Milvus: from .milvus.config import MilvusConfig @@ -227,6 +236,11 @@ def case_config_cls( # noqa: PLR0911 self, index_type: IndexType | None = None, ) -> type[DBCaseConfig]: + + if self == DB.MSSQL: + from .mssql.config import MSSQLVectorIndexConfig + return MSSQLVectorIndexConfig + if self == DB.Milvus: from .milvus.config import _milvus_case_config diff --git a/vectordb_bench/backend/clients/mssql/cli.py b/vectordb_bench/backend/clients/mssql/cli.py new file mode 100644 index 000000000..e72014e8c --- /dev/null +++ b/vectordb_bench/backend/clients/mssql/cli.py @@ -0,0 +1,56 @@ +from typing import Annotated, Unpack + +import click +from pydantic import SecretStr + +from ....cli.cli import ( + CommonTypedDict, + cli, + click_parameter_decorators_from_typed_dict, + run, +) +from .. import DB + + +class MSSQLTypedDict(CommonTypedDict): + server: Annotated[ + str, click.option("--server", type=str, help="server url", required=True) + ] + database: Annotated[ + str, + click.option("--database", type=str, help="database name", required=True), + ] + uid: Annotated[ + str, + click.option("--uid", type=str, help="User id", required=False), + ] + pwd: Annotated[ + str, + click.option("--pwd", type=str, help="user password", required=False), + ] + entraid: Annotated[ + str, + click.option("--entraid", type=str, help="Entra Id Authentication", required=False), + ] + + + +@cli.command() +@click_parameter_decorators_from_typed_dict(MSSQLTypedDict) +def MSSQL(**parameters: Unpack[MSSQLTypedDict]): + from .config import MSSQLConfig, MSSQLVectorIndexConfig + + run( + db=DB.MSSQL, + db_config=MSSQLConfig( + server=parameters["server"], + database=parameters["database"], + uid=parameters["uid"], + pwd=parameters["pwd"], + entraid=parameters["entraid"] + ), + db_case_config=MSSQLVectorIndexConfig( + + ), + **parameters, + ) diff --git a/vectordb_bench/backend/clients/mssql/config.py b/vectordb_bench/backend/clients/mssql/config.py new file mode 100644 index 000000000..7e2321138 --- /dev/null +++ b/vectordb_bench/backend/clients/mssql/config.py @@ -0,0 +1,102 @@ +import pyodbc +import struct +import logging +from azure.identity import ManagedIdentityCredential +from pydantic import BaseModel, SecretStr +from typing import Optional +from ..api import DBConfig, DBCaseConfig, MetricType + +log = logging.getLogger(__name__) + +MSSQL_CONNECTION_STRING_PLACEHOLDER="DRIVER={ODBC Driver 18 for SQL Server};SERVER=%s;DATABASE=%s;UID=%s;PWD=%s;LongAsMax=yes;Connect Timeout=30;TrustServerCertificate=Yes" + +# --- Constants for Token Authentication --- +SQL_COPT_SS_ACCESS_TOKEN = 1256 +SQL_SERVER_TOKEN_SCOPE = "https://database.windows.net/.default" + +# --- Your Modified MSSQLConfig Class --- + +class MSSQLConfig(DBConfig): + server: str + database: str + uid: Optional[str] = None + pwd: Optional[SecretStr] = None + entraid: Optional[str] = None + + def to_dict(self) -> dict: + """ + Prepares connection parameters. If entraid is provided, it fetches a token + manually and returns connection attributes for pyodbc. + """ + # --- Case 1: Standard SQL Authentication --- + if self.entraid is None: + if not self.uid or not self.pwd: + log.error("UID and PWD must be provided for standard SQL auth.") + + pwd_str = self.pwd.get_secret_value() + connection_string = ( + f"DRIVER={{ODBC Driver 18 for SQL Server}};" + f"SERVER={self.server};" + f"DATABASE={self.database};" + f"UID={self.uid};" + f"PWD={pwd_str};" + "LongAsMax=yes;" + "Connect Timeout=30;" + "Encrypt=yes;" + "TrustServerCertificate=Yes" + ) + return {"connection_string": connection_string} + + # --- Case 2: Entra ID Managed Identity (Manual Token Auth) --- + log.info(f"Attempting to get token for User-Assigned Identity: {self.entraid}") + + # 1. Get credentials and token using azure-identity + credential = ManagedIdentityCredential(client_id=self.entraid) + access_token = credential.get_token(SQL_SERVER_TOKEN_SCOPE) + token_bytes = access_token.token.encode("UTF-16-LE") + + # 2. Pack the token for the driver + token_struct = struct.pack(f' str: + if self.metric_type == MetricType.L2: + return "euclidean" + elif self.metric_type == MetricType.IP: + return "dot" + return "cosine" + + def index_param(self) -> dict: + return { + "lists" : self.lists, + "metric" : self.parse_metric() + } + + def search_param(self) -> dict: + return { + "efSearch" : self.efSearch, + "metric" : self.parse_metric() + } diff --git a/vectordb_bench/backend/clients/mssql/mssql.py b/vectordb_bench/backend/clients/mssql/mssql.py new file mode 100644 index 000000000..53dee7f29 --- /dev/null +++ b/vectordb_bench/backend/clients/mssql/mssql.py @@ -0,0 +1,224 @@ +"""Wrapper around MSSQL""" + +import logging +from contextlib import contextmanager +from typing import Any, Generator, Optional, Tuple, Sequence + +from ..api import VectorDB, DBCaseConfig + +import pyodbc +import json + +log = logging.getLogger(__name__) + +class MSSQL(VectorDB): + def __init__( + self, + dim: int, + db_config: dict, + db_case_config: DBCaseConfig, + collection_name: str = "vector", + drop_old: bool = False, + **kwargs, + ): + self.db_config = db_config + self.case_config = db_case_config + self.table_name = collection_name + "_" + str(dim) + self.dim = dim + self.schema_name = "benchmark" + self.drop_old = drop_old + + log.info("db_case_config: " + str(db_case_config)) + + log.info(f"Connecting to MSSQL...") + #log.info(self.db_config['connection_string']) + cnxn = pyodbc.connect( + self.db_config.get("connection_string"), + attrs_before=self.db_config.get("attrs_before") + ) + cursor = cnxn.cursor() + + log.info(f"Creating schema...") + cursor.execute(f""" + if (schema_id('{self.schema_name}') is null) begin + exec('create schema [{self.schema_name}] authorization [dbo];') + end; + """) + cnxn.commit() + + if drop_old: + log.info(f"Dropping existing table... drop table if exists [{self.schema_name}].[{self.table_name}] ") + cursor.execute(f""" + drop table if exists [{self.schema_name}].[{self.table_name}] + """) + cnxn.commit() + + + log.info(f"Creating vector table...") + cursor.execute(f""" + if object_id('[{self.schema_name}].[{self.table_name}]') is null begin + create table [{self.schema_name}].[{self.table_name}] ( + id int not null primary key clustered, + [vector] vector({self.dim}) not null + ) + end + """) + cnxn.commit() + + log.info(f"Dropping old loading vector table type and stored procedure") + cursor.execute(f""" + drop procedure if exists stp_load_vectors + drop type if exists dbo.vector_payload + """) + cnxn.commit() + + log.info(f"Creating table type...") + cursor.execute(f""" + if type_id('dbo.vector_payload') is null begin + create type dbo.vector_payload as table + ( + id int not null, + [vector] vector({self.dim}) not null + ) + end + """) + cursor.commit() + + log.info(f"Creating stored procedure...") + cursor.execute(f""" + create or alter procedure dbo.stp_load_vectors + @dummy int, + @payload dbo.vector_payload readonly + as + begin + set nocount on + insert into [{self.schema_name}].[{self.table_name}] (id, [vector]) select id, [vector] from @payload; + end + """) + cnxn.commit() + + cursor.close() + cnxn.close() + + @contextmanager + def init(self) -> Generator[None, None, None]: + cnxn = pyodbc.connect( + self.db_config.get("connection_string"), + attrs_before=self.db_config.get("attrs_before") + ) + self.cnxn = cnxn + cnxn.autocommit = True + self.cursor = cnxn.cursor() + try: + yield + finally: + self.cursor.close() + self.cnxn.close() + self.cursor = None + self.cnxn = None + + def ready_to_load(self): + log.info(f"MSSQL ready to load") + pass + + def optimize(self): + log.info(f"MSSQL optimize") + search_param = self.case_config.search_param() + metric_function = search_param["metric"] + cursor = self.cursor + if self.drop_old: + cursor.execute(f""" + if exists(select * from sys.indexes where object_id = object_id('[{self.schema_name}].[{self.table_name}]') and type=8) + begin + drop index vec_idx on [{self.schema_name}].[{self.table_name}]; + end + """, + ) + + cursor.execute(f""" + create vector index vec_idx on [{self.schema_name}].[{self.table_name}]([vector]) with (metric = '{metric_function}', type = 'DiskANN'); + """ + ) + + def ready_to_search(self): + log.info(f"MSSQL ready to search") + pass + + def insert_embeddings( + self, + embeddings: list[list[float]], + metadata: list[int], + **kwargs: Any, + ) -> Tuple[int, Optional[Exception]]: + try: + log.info(f'Loading batch of {len(metadata)} vectors...') + #return len(metadata), None + + log.info(f'Generating param list...') + params = [(metadata[i], json.dumps(embeddings[i])) for i in range(len(metadata))] + + log.info(f'Loading table...') + cursor = self.cursor + cursor.execute("EXEC dbo.stp_load_vectors @dummy=?, @payload=?", (1, params)) + return len(metadata), None + except Exception as e: + #cursor.rollback() + log.warning(f"Failed to insert data into vector table ([{self.schema_name}].[{self.table_name}]), error: {e}") + return 0, e + + def search_embedding( + self, + query: list[float], + k: int = 100, + filters: dict | None = None, + timeout: int | None = None, + ) -> list[int]: + search_param = self.case_config.search_param() + metric_function = search_param["metric"] + #efSearch = search_param["efSearch"] + cursor = self.cursor + if filters: + # select top(?) v.id from [{self.schema_name}].[{self.table_name}] v where v.id >= ? order by vector_distance(?, cast(? as varchar({self.dim})), v.[vector]) + cursor.execute(f""" + select + t.id + from + vector_search( + table = [{self.schema_name}].[{self.table_name}] AS t, + column = [vector], + similar_to = ?, + metric = '{metric_function}', + top_n = ? + ) AS s + where + v.id >= ? + """, + json.dumps(query), + k, + int(filters.get('id')), + ) + else: + # select top(?) v.id from [{self.schema_name}].[{self.table_name}] v order by vector_distance(?, cast(? as vector({self.dim})), v.[vector]) + cursor.execute(f""" + declare @v vector({self.dim}) = ?; + select + t.id + from + vector_search( + table = [{self.schema_name}].[{self.table_name}] AS t, + column = [vector], + similar_to = @v, + metric = '{metric_function}', + top_n = ? + ) AS s + order by + t.id + """, + json.dumps(query), + k, + ) + rows = cursor.fetchall() + res = [row.id for row in rows] + return res + + diff --git a/vectordb_bench/backend/runner/mp_runner.py b/vectordb_bench/backend/runner/mp_runner.py index 5b69b5481..81c7ef01a 100644 --- a/vectordb_bench/backend/runner/mp_runner.py +++ b/vectordb_bench/backend/runner/mp_runner.py @@ -79,14 +79,12 @@ def search( if count % 500 == 0: log.debug( - f"({mp.current_process().name:16}) ", - f"search_count: {count}, latest_latency={time.perf_counter()-s}", + f"({mp.current_process().name:16}) search_count: {count}, latest_latency={time.perf_counter()-s}", ) total_dur = round(time.perf_counter() - start_time, 4) log.info( - f"{mp.current_process().name:16} search {self.duration}s: " - f"actual_dur={total_dur}s, count={count}, qps in this process: {round(count / total_dur, 4):3}", + f"{mp.current_process().name:16} search {self.duration}s: actual_dur={total_dur}s, count={count}, qps in this process: {round(count / total_dur, 4):3}", ) return (count, total_dur, latencies) diff --git a/vectordb_bench/backend/runner/serial_runner.py b/vectordb_bench/backend/runner/serial_runner.py index 7eb59432b..63c17e7a8 100644 --- a/vectordb_bench/backend/runner/serial_runner.py +++ b/vectordb_bench/backend/runner/serial_runner.py @@ -71,8 +71,7 @@ def task(self) -> int: ) log.info( - f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, ", - f"dur={time.perf_counter()-start}", + f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, dur={time.perf_counter()-start}", ) return count @@ -251,6 +250,9 @@ def search(self, args: tuple[list, pd.DataFrame]) -> tuple[float, float, float]: avg_ndcg = round(np.mean(ndcgs), 4) cost = round(np.sum(latencies), 4) p99 = round(np.percentile(latencies, 99), 4) + p95 = round(np.percentile(latencies, 95), 4) + p50 = round(np.percentile(latencies, 50), 4) + log.info( f"{mp.current_process().name:14} search entire test_data: " f"cost={cost}s, " @@ -258,9 +260,10 @@ def search(self, args: tuple[list, pd.DataFrame]) -> tuple[float, float, float]: f"avg_recall={avg_recall}, " f"avg_ndcg={avg_ndcg}," f"avg_latency={avg_latency}, " - f"p99={p99}", + f"p99={p99}, p95={p95}, p50={p50}", ) - return (avg_recall, avg_ndcg, p99) + log.info(p95) + return (avg_recall, avg_ndcg, p99, p95, p50) def _run_in_subprocess(self) -> tuple[float, float]: with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor: diff --git a/vectordb_bench/backend/task_runner.py b/vectordb_bench/backend/task_runner.py index e24d74f03..4af2169b9 100644 --- a/vectordb_bench/backend/task_runner.py +++ b/vectordb_bench/backend/task_runner.py @@ -196,7 +196,7 @@ def _run_perf_case(self, drop_old: bool = True) -> Metric: m.recall = search_results.recall m.serial_latencies = search_results.serial_latencies """ - m.recall, m.ndcg, m.serial_latency_p99 = search_results + m.recall, m.ndcg, m.serial_latency_p99, m.serial_latency_p95, m.serial_latency_p50 = search_results except Exception as e: log.warning(f"Failed to run performance case, reason = {e}") diff --git a/vectordb_bench/cli/vectordbbench.py b/vectordb_bench/cli/vectordbbench.py index 5e3798691..cadbaa04b 100644 --- a/vectordb_bench/cli/vectordbbench.py +++ b/vectordb_bench/cli/vectordbbench.py @@ -1,6 +1,5 @@ from ..backend.clients.alloydb.cli import AlloyDBScaNN from ..backend.clients.aws_opensearch.cli import AWSOpenSearch -from ..backend.clients.memorydb.cli import MemoryDB from ..backend.clients.milvus.cli import MilvusAutoIndex from ..backend.clients.pgdiskann.cli import PgDiskAnn from ..backend.clients.pgvecto_rs.cli import PgVectoRSHNSW, PgVectoRSIVFFlat @@ -10,13 +9,13 @@ from ..backend.clients.test.cli import Test from ..backend.clients.weaviate_cloud.cli import Weaviate from ..backend.clients.zilliz_cloud.cli import ZillizAutoIndex +from ..backend.clients.mssql.cli import MSSQL from .cli import cli cli.add_command(PgVectorHNSW) cli.add_command(PgVectoRSHNSW) cli.add_command(PgVectoRSIVFFlat) cli.add_command(Redis) -cli.add_command(MemoryDB) cli.add_command(Weaviate) cli.add_command(Test) cli.add_command(ZillizAutoIndex) @@ -25,6 +24,7 @@ cli.add_command(PgVectorScaleDiskAnn) cli.add_command(PgDiskAnn) cli.add_command(AlloyDBScaNN) +cli.add_command(MSSQL) if __name__ == "__main__": diff --git a/vectordb_bench/metric.py b/vectordb_bench/metric.py index e0b6cff0e..ae994925f 100644 --- a/vectordb_bench/metric.py +++ b/vectordb_bench/metric.py @@ -17,6 +17,8 @@ class Metric: load_duration: float = 0.0 # duration to load all dataset into DB qps: float = 0.0 serial_latency_p99: float = 0.0 + serial_latency_p95: float = 0.0 + serial_latency_p50: float = 0.0 recall: float = 0.0 ndcg: float = 0.0 conc_num_list: list[int] = field(default_factory=list) diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py index 49bb04ae0..7a0ef76d7 100644 --- a/vectordb_bench/models.py +++ b/vectordb_bench/models.py @@ -295,6 +295,8 @@ def append_return(x: any, y: any): max_load_dur, max_qps, 15, + 15, + 15, max_recall, 14, 5, @@ -302,7 +304,7 @@ def append_return(x: any, y: any): DATA_FORMAT = ( f"%-{max_db}s | %-{max_db_labels}s %-{max_case}s %-{len(self.task_label)}s" - f" | %-{max_load_dur}s %-{max_qps}s %-15s %-{max_recall}s %-14s" + f" | %-{max_load_dur}s %-{max_qps}s %-15s %-15s %-15s %-{max_recall}s %-14s" f" | %-5s" ) @@ -314,6 +316,8 @@ def append_return(x: any, y: any): "load_dur", "qps", "latency(p99)", + "latency(p95)", + "latency(p50)", "recall", "max_load_count", "label", @@ -336,6 +340,8 @@ def append_return(x: any, y: any): f.metrics.load_duration, f.metrics.qps, f.metrics.serial_latency_p99, + f.metrics.serial_latency_p95, + f.metrics.serial_latency_p50, f.metrics.recall, f.metrics.max_load_count, f.label.value,