From 64c55522fd7ecc5044f302d0f8b7ac72bad598f3 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Tue, 24 Feb 2026 22:10:59 -0300 Subject: [PATCH] fix csv file reader (#2248) * fix * fix test with inline content * fix format --- .../patch-20260225001919068435.json | 4 ++++ packages/graphrag-input/graphrag_input/csv.py | 3 ++- tests/unit/indexing/input/test_csv_loader.py | 23 +++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 .semversioner/next-release/patch-20260225001919068435.json diff --git a/.semversioner/next-release/patch-20260225001919068435.json b/.semversioner/next-release/patch-20260225001919068435.json new file mode 100644 index 000000000..01df0b46f --- /dev/null +++ b/.semversioner/next-release/patch-20260225001919068435.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "fix csv reader" +} diff --git a/packages/graphrag-input/graphrag_input/csv.py b/packages/graphrag-input/graphrag_input/csv.py index e041bff27..9f7ddaec5 100644 --- a/packages/graphrag-input/graphrag_input/csv.py +++ b/packages/graphrag-input/graphrag_input/csv.py @@ -4,6 +4,7 @@ """A module containing 'CSVFileReader' model.""" import csv +import io import logging import sys @@ -39,6 +40,6 @@ async def read_file(self, path: str) -> list[TextDocument]: """ file = await self._storage.get(path, encoding=self._encoding) - reader = csv.DictReader(file.splitlines()) + reader = csv.DictReader(io.StringIO(file)) rows = list(reader) return await self.process_data_columns(rows, path) diff --git a/tests/unit/indexing/input/test_csv_loader.py b/tests/unit/indexing/input/test_csv_loader.py index 1a84d8267..a2cd17175 100644 --- a/tests/unit/indexing/input/test_csv_loader.py +++ b/tests/unit/indexing/input/test_csv_loader.py @@ -54,3 +54,26 @@ async def test_csv_loader_multiple_files(): reader = create_input_reader(config, storage) documents = await reader.read_files() assert len(documents) == 4 + + +async def test_csv_loader_preserves_multiline_fields(tmp_path): + """Multiline quoted CSV fields must retain their internal newlines.""" + csv_content = ( + "title,text\r\n" + '"Post 1","Line one.\nLine two.\nLine three."\r\n' + '"Post 2","Single line."\r\n' + ) + (tmp_path / "input.csv").write_text(csv_content, encoding="utf-8") + config = InputConfig( + type=InputType.Csv, + text_column="text", + title_column="title", + ) + storage = create_storage(StorageConfig(base_dir=str(tmp_path))) + reader = create_input_reader(config, storage) + documents = await reader.read_files() + assert len(documents) == 2 + assert documents[0].title == "Post 1" + assert documents[0].text == "Line one.\nLine two.\nLine three." + assert documents[1].title == "Post 2" + assert documents[1].text == "Single line."