From 661f5846bf676e6f530594c81df14d50c2a5c011 Mon Sep 17 00:00:00 2001 From: Void Freud Date: Fri, 27 Mar 2026 21:33:22 +0200 Subject: [PATCH 1/2] Use UTC timestamps, configurable analysis dates, and add CI/publish workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Switch format_timestamp and date_group_key to UTC for reproducible output - Make analysis date format configurable via AnalyzeConfig.time_format - Add GitHub Actions CI workflow (test + lint across Python 3.10–3.13) - Add GitHub Actions publish workflow (trusted PyPI publishing on release) Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 36 +++++++++++++++++++ .github/workflows/publish.yml | 28 +++++++++++++++ chatgpt_export_tool/commands/analyze.py | 11 ++++-- .../core/analysis_formatter.py | 8 +++-- chatgpt_export_tool/core/output/paths.py | 4 ++- chatgpt_export_tool/core/transcript/access.py | 6 ++-- chatgpt_export_tool/core/utils.py | 10 +++--- 7 files changed, 90 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3c0a2ed --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,36 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install uv + uses: astral-sh/setup-uv@v4 + + - name: Install dependencies + run: uv sync --group dev + + - name: Lint + run: uv run ruff check chatgpt_export_tool tests + + - name: Format check + run: uv run ruff format --check chatgpt_export_tool tests + + - name: Test + run: uv run pytest --cov=chatgpt_export_tool --cov-report=term-missing diff --git 
a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..d7becf6 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,28 @@ +name: Publish to PyPI + +on: + release: + types: [published] + +permissions: + id-token: write + +jobs: + publish: + runs-on: ubuntu-latest + environment: pypi + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Install build tools + run: pip install build + + - name: Build package + run: python -m build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/chatgpt_export_tool/commands/analyze.py b/chatgpt_export_tool/commands/analyze.py index 28a9590..b2d429c 100644 --- a/chatgpt_export_tool/commands/analyze.py +++ b/chatgpt_export_tool/commands/analyze.py @@ -1,7 +1,7 @@ """Analyze command for chatgpt_export_tool.""" import argparse -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from typing import Optional @@ -11,6 +11,7 @@ AnalyzeConfig, format_analysis_text, ) +from chatgpt_export_tool.core.config.models import DEFAULT_TIME_FORMAT from chatgpt_export_tool.core.file_utils import get_file_size from chatgpt_export_tool.core.parser import JSONParser from chatgpt_export_tool.core.utils import format_size @@ -54,11 +55,15 @@ def _execute(self) -> None: results = parser.analyze(verbose=self.logger.level <= 20) results["file_size"] = format_size(file_size) results["filepath"] = self.filepath - results["analysis_date"] = datetime.now().strftime("%H:%M %d-%m-%Y") + time_format = DEFAULT_TIME_FORMAT + results["analysis_date"] = datetime.now(tz=timezone.utc).strftime(time_format) output = format_analysis_text( results, - AnalyzeConfig(include_fields=self.include_fields), + AnalyzeConfig( + include_fields=self.include_fields, + time_format=time_format, + ), ) if self.output_file: diff --git a/chatgpt_export_tool/core/analysis_formatter.py 
b/chatgpt_export_tool/core/analysis_formatter.py index 86453eb..5dfd8e4 100644 --- a/chatgpt_export_tool/core/analysis_formatter.py +++ b/chatgpt_export_tool/core/analysis_formatter.py @@ -16,9 +16,11 @@ class AnalyzeConfig: Attributes: include_fields: Whether to include field coverage details. + time_format: strftime format for date/time display. """ include_fields: bool = False + time_format: str = "%H:%M %d-%m-%Y" def format_analysis_text( @@ -56,8 +58,10 @@ def format_analysis_text( lines.append(f"Total message nodes in mappings: {results['message_count']:,}") if results.get("min_date") is not None and results.get("max_date") is not None: - lines.append(f"From: {format_timestamp(results['min_date'])}") - lines.append(f"To: {format_timestamp(results['max_date'])}") + lines.append( + f"From: {format_timestamp(results['min_date'], config.time_format)}" + ) + lines.append(f"To: {format_timestamp(results['max_date'], config.time_format)}") lines.append("") diff --git a/chatgpt_export_tool/core/output/paths.py b/chatgpt_export_tool/core/output/paths.py index 65e7a0b..1fb5e21 100644 --- a/chatgpt_export_tool/core/output/paths.py +++ b/chatgpt_export_tool/core/output/paths.py @@ -94,4 +94,6 @@ def get_unique_filepath( candidate = filepath.with_name(f"{filepath.stem}_{suffix}{filepath.suffix}") if candidate not in used_paths and not candidate.exists(): return candidate - raise RuntimeError(f"Could not find unique path for {filepath} after 10000 attempts") + raise RuntimeError( + f"Could not find unique path for {filepath} after 10000 attempts" + ) diff --git a/chatgpt_export_tool/core/transcript/access.py b/chatgpt_export_tool/core/transcript/access.py index 4b117ab..f375955 100644 --- a/chatgpt_export_tool/core/transcript/access.py +++ b/chatgpt_export_tool/core/transcript/access.py @@ -1,6 +1,6 @@ """Shared read-only helpers for conversation structures.""" -from datetime import datetime +from datetime import datetime, timezone from typing import Any, Iterator, 
Optional from .thread import ( @@ -86,7 +86,9 @@ def get_date_group_key(conversation: dict[str, Any]) -> Optional[str]: return None try: - return datetime.fromtimestamp(float(create_time)).strftime("%Y-%m-%d") + return datetime.fromtimestamp(float(create_time), tz=timezone.utc).strftime( + "%Y-%m-%d" + ) except (TypeError, ValueError, OSError): return None diff --git a/chatgpt_export_tool/core/utils.py b/chatgpt_export_tool/core/utils.py index 8240089..fd069d0 100644 --- a/chatgpt_export_tool/core/utils.py +++ b/chatgpt_export_tool/core/utils.py @@ -1,6 +1,6 @@ """Small shared formatting helpers.""" -from datetime import datetime +from datetime import datetime, timezone def format_size(size_bytes: int) -> str: @@ -21,14 +21,14 @@ def format_size(size_bytes: int) -> str: def format_timestamp(timestamp: float, time_format: str = "%H:%M %d-%m-%Y") -> str: - """Format a Unix timestamp to human-readable date string. + """Format a Unix timestamp to a UTC date string. Args: timestamp: Unix timestamp (seconds since epoch). May be float, int, or Decimal. + time_format: strftime format string. Returns: - Formatted date string. + Formatted UTC date string. 
""" - # Handle Decimal values from JSON parser - dt = datetime.fromtimestamp(float(timestamp)) + dt = datetime.fromtimestamp(float(timestamp), tz=timezone.utc) return dt.strftime(time_format) From 7016eb3294f0540ecf44ee2a97ea7555a074f2c1 Mon Sep 17 00:00:00 2001 From: Void Freud Date: Sat, 28 Mar 2026 00:27:34 +0200 Subject: [PATCH 2/2] Default to Markdown output format with proper md syntax - Add md as a first-class format (default), alongside txt and json - Markdown output uses # headings, **bold** metadata, > blockquoted context, --- separators, and no indentation - get_formatter auto-applies markdown heading style for md format - txt format still works unchanged via --format txt - Rewrite README and Fields.md with polished, professional styling Co-Authored-By: Claude Opus 4.6 --- Fields.md | 233 ++++------- README.md | 362 +++++++++--------- chatgpt_export.toml.example | 4 +- chatgpt_export_tool/commands/export.py | 6 +- chatgpt_export_tool/core/config/models.py | 2 +- chatgpt_export_tool/core/config/validation.py | 2 +- chatgpt_export_tool/core/export_service.py | 2 +- chatgpt_export_tool/core/output/formatters.py | 49 ++- chatgpt_export_tool/core/output/naming.py | 2 +- chatgpt_export_tool/core/output/paths.py | 2 +- chatgpt_export_tool/core/output/writer.py | 2 +- tests/test_output_writer.py | 10 +- tests/test_runtime_config.py | 4 +- tests/test_runtime_contract.py | 8 +- 14 files changed, 338 insertions(+), 350 deletions(-) diff --git a/Fields.md b/Fields.md index 1e0caa2..67456fe 100644 --- a/Fields.md +++ b/Fields.md @@ -1,158 +1,109 @@ # Field Selection Reference -This document describes the field-selection and metadata-selection features that the current CLI actually supports. +Practical reference for the `--fields` and `--include`/`--exclude` options in `chatgpt-export`. -It is intentionally practical rather than exhaustive. 
The goal is to document the fields, groups, and selectors you can use with `chatgpt-export`, not to guess every field that might appear in every historical `conversations.json` file. +--- -## Structural Levels +## How Data Is Structured -The tool understands conversation data at these nested levels: +ChatGPT exports nest data at these levels: -```text +``` conversation └── mapping node └── message - ├── author - ├── content - └── metadata + ├── author (role, name) + ├── content (content_type, parts, text, ...) + └── metadata (model_slug, message_type, ...) ``` -The field selector can retain or remove fields across those levels while preserving the containers needed to reach nested selected fields. - -Text export is transcript-oriented: it follows the active branch defined by `current_node` and `parent` links, then applies transcript visibility rules from the TOML config passed to `export`. - -## `--fields` +The field selector retains or removes fields across these levels while preserving the containers needed to reach any selected nested field. -The `--fields` argument accepts one field-selection spec. +Text/Markdown export is **transcript-oriented** — it follows the active branch via `current_node` and `parent` links, then applies visibility rules from the TOML config. -Supported forms: +--- -```text -all -none -include field1,field2 -exclude field1,field2 -groups group1,group2 -``` +## `--fields` -Examples: +Controls which structural fields survive before formatting. 
```bash -chatgpt-export export data.json --fields all -chatgpt-export export data.json --fields none -chatgpt-export export data.json --fields "include title,create_time,mapping" -chatgpt-export export data.json --fields "exclude moderation_results,plugin_ids" -chatgpt-export export data.json --fields "groups minimal" -chatgpt-export export data.json --fields "groups conversation,message" +--fields all # keep everything (default) +--fields none # empty structure +--fields "include title,create_time,mapping" # whitelist +--fields "exclude moderation_results" # blacklist +--fields "groups minimal" # named group +--fields "groups conversation,message" # combine groups ``` -Multi-word specs must be quoted. +> Multi-word specs must be quoted. -## Field Groups +--- -The current built-in field groups are: +## Built-in Field Groups ### `conversation` -Includes: - -- `_id` -- `conversation_id` -- `create_time` -- `update_time` -- `title` -- `type` +`_id` · `conversation_id` · `create_time` · `update_time` · `title` · `type` ### `message` -Includes: - -- `author` -- `content` -- `status` -- `end_turn` +`author` · `content` · `status` · `end_turn` ### `metadata` -Includes: - -- `model_slug` -- `message_type` -- `is_archived` +`model_slug` · `message_type` · `is_archived` ### `minimal` -Includes: +`title` · `create_time` · `message` -- `title` -- `create_time` -- `message` +--- -## Known Structural Fields +## All Known Structural Fields -These are the structural fields the tool currently categorizes by level. +
+Conversation level -### Conversation +`title` · `create_time` · `update_time` · `mapping` · `moderation_results` · `current_node` · `plugin_ids` · `_id` · `conversation_id` · `type` -- `title` -- `create_time` -- `update_time` -- `mapping` -- `moderation_results` -- `current_node` -- `plugin_ids` -- `_id` -- `conversation_id` -- `type` +
-### Mapping Node +
+Mapping node level -- `id` -- `parent` -- `children` -- `message` +`id` · `parent` · `children` · `message` -### Message +
-- `author` -- `content` -- `status` -- `end_turn` -- `weight` -- `recipient` -- `channel` -- `create_time` -- `update_time` +
+Message level -### Author +`author` · `content` · `status` · `end_turn` · `weight` · `recipient` · `channel` · `create_time` · `update_time` -- `role` -- `name` +
-### Content +
+Author -- `content_type` -- `parts` -- `language` -- `response_format_name` -- `text` -- `user_profile` -- `user_instructions` +`role` · `name` -Unknown names are still allowed in `include` and `exclude` field specs, but the validator may warn about them. +
-## Metadata Filtering +
+Content -Metadata filtering is separate from `--fields`. +`content_type` · `parts` · `language` · `response_format_name` · `text` · `user_profile` · `user_instructions` -Use: +
-- `--include PATTERN [PATTERN ...]` -- `--exclude PATTERN [PATTERN ...]` +> Unknown field names are allowed in `include`/`exclude` specs, but the validator may warn about them. -These apply to known metadata names inside nested `message.metadata` dictionaries after structural field filtering. +--- -Examples: +## Metadata Filtering + +Separate from `--fields`. Applies only to keys inside `message.metadata` dictionaries, *after* structural filtering. ```bash chatgpt-export export data.json --include model_slug @@ -160,73 +111,51 @@ chatgpt-export export data.json --include "model*" --exclude plugin_ids chatgpt-export export data.json --fields "groups message" --include is_archived ``` -Pattern matching supports: - -- exact matches -- substring matches -- shell-style wildcards such as `model*` - -## Known Metadata Names +**Pattern matching:** exact, substring, and shell-style globs (`model*`). -The current metadata filter recognizes these names: +**Known metadata names:** `model_slug` · `message_type` · `plugin_ids` · `is_archived` -- `model_slug` -- `message_type` -- `plugin_ids` -- `is_archived` +--- -## How Filtering Combines +## Filtering Pipeline -Filtering happens in this order: - -1. structural field selection through `--fields` -2. metadata filtering through `--include` and `--exclude` -3. formatting to text or JSON - -This means: +``` +conversations.json + → 1. Structural field selection (--fields) + → 2. Metadata filtering (--include / --exclude) + → 3. Formatting (md / txt / json) + → output +``` -- `--fields` decides whether structural containers like `mapping`, `message`, `author`, `content`, and `metadata` survive -- `--include` and `--exclude` decide which metadata keys remain inside metadata dictionaries +`--fields` decides whether containers like `mapping`, `message`, and `metadata` survive. +`--include`/`--exclude` decides which keys remain inside those metadata containers. 
-## Practical Recipes +--- -Keep only a small readable subset: +## Recipes ```bash +# Minimal readable export chatgpt-export export data.json --fields "groups minimal" -``` - -Keep titles and timestamps but drop plugin noise: -```bash +# Drop noise, keep structure chatgpt-export export data.json --fields "exclude plugin_ids,moderation_results" -``` - -Keep only message-oriented structure and model metadata: -```bash +# Message structure + model info only chatgpt-export export data.json --fields "groups message" --include "model*" -``` -Write one file per conversation with a minimal payload: - -```bash +# One file per conversation, minimal payload chatgpt-export export data.json --split subject --output-dir exports --fields "groups minimal" ``` +--- + ## Notes -- `analyze --fields` reports field coverage; it does not accept the export-style field-selection spec. -- `export` can load defaults from a TOML file via `--config PATH`. -- The repo ships `chatgpt_export.toml.example` as a template; copy it to a local file before use. -- `export --split single` writes to stdout unless `--output` is provided. -- Subject split files are named from the source conversation title plus identifier. -- Split modes such as `subject`, `date`, and `id` write to `--output-dir`. -- Text export follows the active conversation branch and is configurable through the `[transcript]` and `[text_output]` TOML sections. -- Default text export shows user text, assistant text, assistant thoughts, and a compact preview of `user_editable_context`. -- Default text export hides assistant code, reasoning recap, and tool plumbing unless the transcript policy explicitly enables them. -- Advanced transcript controls include `user_editable_context_mode`, `show_visually_hidden_content_types`, `include_content_types`, and `exclude_content_types`. 
-- Text layout controls include `layout_mode`, `heading_style`, `include_turn_count_in_header`, `include_turn_numbers`, `turn_separator`, `strip_chatgpt_artifacts`, and `wrap_width`. -- A practical default is `layout_mode = "reading"` with `turn_separator = "---"` and artifact stripping enabled. -- For tighter exports, use `layout_mode = "compact"` and disable turn counts. -- For notes-oriented output, use `heading_style = "markdown"`. +- `analyze --fields` reports field *coverage* — it does not use the export-style field-selection spec. +- `export --split single` writes to stdout unless `--output` is given. +- Subject split files are named `Title_ID.md` (or `.txt`/`.json`). +- Default Markdown export shows user text, assistant text, thoughts, and compact context previews. +- Hidden by default: assistant code, reasoning recap, tool plumbing. +- All transcript visibility is configurable via `[transcript]` in the TOML config. +- Text layout is configurable via `[text_output]`: layout mode, heading style, wrap width, separators, turn numbering. diff --git a/README.md b/README.md index d1e1820..510128e 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,27 @@ -# ChatGPT Export Tool +

+

ChatGPT Export Tool

+

+ Stream, analyze, and export your ChatGPT history — without loading it all into memory. +

+

+ Python 3.10+ + MIT License + Ruff + uv +

+

-[![Python Version](https://img.shields.io/badge/python-3.10%2B-blue)](https://www.python.org/) -[![License](https://img.shields.io/badge/license-MIT-green)](LICENSE) -[![Code Style](https://img.shields.io/badge/code%20style-ruff-cyan)](https://github.com/astral-sh/ruff) +
-A CLI for analyzing and exporting ChatGPT `conversations.json` files. +A Python CLI that takes your `conversations.json` export from ChatGPT and turns it into clean, readable **Markdown**, plain text, or structured JSON — with full control over what gets included. -The project focuses on two things: +It uses **streaming JSON parsing** ([ijson](https://github.com/ICRAR/ijson)) so even multi-hundred-megabyte exports never need to be loaded into memory. Filtering, formatting, and output are all modular and independently configurable through CLI flags or a single TOML config file. -- analyzing the structure of a ChatGPT export without loading the whole file into memory -- exporting conversations to text or JSON with structural field filtering and metadata filtering - -It uses streaming JSON parsing with `ijson` and is organized around small core modules so filtering, formatting, split behavior, and path generation can be changed independently. - -Persistent defaults can be stored in a single TOML config file. The repo ships a template at `chatgpt_export.toml.example`. +--- ## Installation -This project currently targets Python `3.10+`. +> **Requires Python `3.10+`**  ·  Managed with [uv](https://docs.astral.sh/uv/) ```bash git clone https://github.com/voidfreud/chatgpt-export-tool.git @@ -25,67 +29,51 @@ cd chatgpt-export-tool uv sync ``` -For development tooling too: +For dev tooling (pytest, ruff, coverage): ```bash uv sync --group dev ``` -You can then run the CLI with: +Verify the install: ```bash uv run chatgpt-export --help ``` -To apply config defaults, copy the template and pass `--config PATH`. 
+--- ## Quick Start -Analyze an export: - -```bash -uv run chatgpt-export analyze path/to/conversations.json -``` - -Include field coverage: - ```bash -uv run chatgpt-export analyze path/to/conversations.json --fields -``` +# Analyze your export — stats without loading the whole file +uv run chatgpt-export analyze conversations.json -Export everything as text to stdout: +# Export everything to markdown (default) on stdout +uv run chatgpt-export export conversations.json -```bash -uv run chatgpt-export export path/to/conversations.json -``` +# Export to a single file +uv run chatgpt-export export conversations.json --output all_chats.md -Export everything as JSON to one file: +# One markdown file per conversation +uv run chatgpt-export export conversations.json --split subject --output-dir exports/ -```bash -uv run chatgpt-export export path/to/conversations.json --format json --output conversations.json +# JSON export +uv run chatgpt-export export conversations.json --format json --output dump.json ``` -Export one file per conversation: - -```bash -uv run chatgpt-export export path/to/conversations.json --split subject --output-dir exports -``` +--- ## Commands ### `analyze` -`analyze` reports high-level structure and statistics for a `conversations.json` file. - -It includes: - -- conversation count -- message count -- file size -- date range -- optional field coverage with `--fields` +Reports structure and statistics for a `conversations.json` file without writing any output files. -Examples: +| What it shows | Flag | +|---|---| +| Conversation & message counts, file size, date range | *(default)* | +| Field coverage per structural level | `--fields` | ```bash uv run chatgpt-export analyze data.json @@ -96,156 +84,188 @@ uv run chatgpt-export analyze data.json --debug ### `export` -`export` writes conversations in either text or JSON format. 
+Converts conversations into **Markdown** (default), **plain text**, or **JSON** with fine-grained control over structure, metadata, and output layout. -It supports: +```bash +# Minimal readable export +uv run chatgpt-export export data.json --fields "groups minimal" -- structural field filtering through `--fields` -- metadata filtering through `--include` and `--exclude` -- transcript-oriented text export that follows the active branch -- split modes for one output, one file per conversation, date folders, or ID-based files +# Full JSON, one file per conversation by date +uv run chatgpt-export export data.json --format json --split date --output-dir by-date/ -Examples: +# Selective metadata +uv run chatgpt-export export data.json --include "model*" --exclude plugin_ids -```bash -uv run chatgpt-export export data.json -uv run chatgpt-export export data.json --output conversations.txt -uv run chatgpt-export export data.json --format json --output conversations.json -uv run chatgpt-export export data.json --split subject --output-dir exports -uv run chatgpt-export export data.json --fields "groups minimal" --split subject --output-dir exports -uv run chatgpt-export export data.json --fields "include title,mapping" --include "model*" --exclude plugin_ids +# Use a config file for persistent defaults cp chatgpt_export.toml.example chatgpt_export.toml uv run chatgpt-export export data.json --config chatgpt_export.toml ``` -## Field Filtering +--- -The `--fields` option controls which structural fields are retained before formatting. +## Output Formats -Supported forms: +| Format | Flag | Extension | Description | +|---|---|---|---| +| **Markdown** | `--format md` | `.md` | Transcript-oriented with `#` headings, `>` blockquoted context, `---` separators. **Default.** | +| **Plain text** | `--format txt` | `.txt` | Indented text with plain labels — good for terminals and grep. | +| **JSON** | `--format json` | `.json` | Filtered conversation objects written as valid JSON. 
| -- `all` -- `none` -- `include field1,field2` -- `exclude field1,field2` -- `groups group1,group2` +Markdown and text exports follow the **active conversation branch** using `current_node` and `parent` links, so you see the conversation as it actually played out — not the full tree with all edits and branches. -Examples: +
+What the Markdown output looks like -```bash -uv run chatgpt-export export data.json --fields all -uv run chatgpt-export export data.json --fields none -uv run chatgpt-export export data.json --fields "include title,create_time,mapping" -uv run chatgpt-export export data.json --fields "exclude moderation_results,plugin_ids" -uv run chatgpt-export export data.json --fields "groups minimal" +```markdown +# Opening bank account in Thailand +**ID:** 68ed8eba-1d00-832a-a2f1-4721496ed217 +**Created:** 23:43 13-10-2025 +**Turns:** 34 + +## Conversation Context +> User profile: The user provided the following information... +> User instructions: Speaking to Nova. Zero restraint... + +## User [23:43 13-10-2025] +Can a foreigner open a Thailand bank account? + +## Assistant [23:44 13-10-2025] +Yup — a foreigner *can* open a bank account in Thailand, but it's +*much more difficult* now than it used to be... ``` -Available field groups: +
+ +Default transcript behavior: -- `conversation` -- `message` -- `metadata` -- `minimal` +- **Shown:** user text, assistant text, assistant thoughts, user editable context (compact preview) +- **Hidden:** tool plumbing, assistant code, reasoning recap, blank nodes -See [Fields.md](Fields.md) for the current field-selection reference. +All of this is configurable via the `[transcript]` section in the TOML config. -## Metadata Filtering +--- -The metadata filter runs after structural field filtering and applies only to keys inside nested `message.metadata` dictionaries. +## Filtering -Examples: +### Structural Fields — `--fields` + +Controls which parts of each conversation object are retained before formatting. ```bash -uv run chatgpt-export export data.json --include model_slug -uv run chatgpt-export export data.json --include "model*" --exclude plugin_ids -uv run chatgpt-export export data.json --fields "groups message" --include is_archived +--fields all # everything (default) +--fields none # structure only +--fields "include title,create_time,mapping" # keep only these +--fields "exclude moderation_results" # drop these +--fields "groups minimal" # use a named group ``` -Currently supported metadata names include: +**Built-in groups:** -- `model_slug` -- `message_type` -- `plugin_ids` -- `is_archived` +| Group | Fields | +|---|---| +| `conversation` | `_id`, `conversation_id`, `create_time`, `update_time`, `title`, `type` | +| `message` | `author`, `content`, `status`, `end_turn` | +| `metadata` | `model_slug`, `message_type`, `is_archived` | +| `minimal` | `title`, `create_time`, `message` | -## Split Modes +See [`Fields.md`](Fields.md) for the full field-selection reference. 
-`export` supports four split modes: +### Metadata — `--include` / `--exclude` -- `single`: one combined output stream or one output file -- `subject`: one file per conversation, named from title plus identifier -- `date`: date folders with one file per conversation -- `id`: one file per conversation, named from conversation ID +Runs *after* structural filtering. Applies only to keys inside nested `message.metadata` dictionaries. -Important output behavior: +```bash +--include model_slug # keep only model_slug +--include "model*" --exclude plugin_ids # glob patterns supported +``` -- `--split single` with no `--output` writes to stdout -- `--split single --output FILE` writes one file -- split modes like `subject`, `date`, and `id` write into `--output-dir` +Known metadata names: `model_slug`, `message_type`, `plugin_ids`, `is_archived`. -## Output Formats +--- -Supported formats: +## Split Modes -- `txt` -- `json` +Control how conversations are distributed across output files. -`txt` is a transcript-oriented export that follows the active branch of the conversation tree. -`json` writes the filtered conversation objects directly. +| Mode | Flag | Behavior | +|---|---|---| +| **Single** | `--split single` | One combined stream or file *(default)* | +| **Subject** | `--split subject` | One file per conversation, named `Title_ID.md` | +| **Date** | `--split date` | Daily folders → one file per conversation | +| **ID** | `--split id` | One file per conversation, named by conversation ID | -By default, text export includes user text, assistant text, assistant thoughts, and user editable context when present. User editable context is rendered in a compact preview by default so transcripts stay readable. Text export hides tool plumbing, assistant code, reasoning recap, and blank/internal nodes unless the transcript policy is changed in config. 
+```bash +# Stdout (single, no --output) +uv run chatgpt-export export data.json -Text output defaults now favor reading clarity: +# Single file +uv run chatgpt-export export data.json --output all.md -- conversation context is rendered as a separate preamble block -- visible turns are grouped into clearer chat-style `User` / `Assistant` sections -- turn counts can be shown in the header -- ChatGPT citation/navigation artifacts can be stripped from text output -- long paragraphs can be wrapped for easier reading +# Split into a directory +uv run chatgpt-export export data.json --split subject --output-dir exports/ +``` -Important transcript policy options include: +> **Note:** `--output` is for single mode only. Split modes use `--output-dir`. -- `user_editable_context_mode` -- `show_visually_hidden_content_types` -- `include_content_types` -- `exclude_content_types` +--- ## Configuration -`export` accepts `--config PATH` and resolves defaults from one TOML file. +All export behavior can be persisted in a single TOML file. The repo ships [`chatgpt_export.toml.example`](chatgpt_export.toml.example) as a starting point. + +```bash +cp chatgpt_export.toml.example chatgpt_export.toml +# edit to taste, then: +uv run chatgpt-export export data.json --config chatgpt_export.toml +``` + +CLI flags always override TOML values. + +<details>
+<summary>TOML sections overview</summary> + +**`[defaults]`** — format, split mode, field selection, output directory, metadata filters -The repo ships `chatgpt_export.toml.example` as a template. Copy it to a local file such as `chatgpt_export.toml` and pass that path explicitly. +**`[transcript]`** — branch following, visibility rules per content type -The config file is TOML and is intentionally kept to one file with sections such as: +| Key | Default | What it does | +|---|---|---| +| `show_system_messages` | `false` | Include system prompts | +| `show_tool_messages` | `false` | Include tool/function calls | +| `show_assistant_code` | `false` | Include code execution blocks | +| `show_reasoning_recap` | `false` | Include reasoning summaries | +| `user_editable_context_mode` | `"compact"` | `"compact"` or `"full"` for context rendering | +| `include_content_types` | `[]` | Whitelist specific content types | +| `exclude_content_types` | `[]` | Blacklist specific content types | -- `[defaults]` for format, split mode, field selection, and output directory -- `[transcript]` for active-branch reconstruction and visibility rules -- `[text_output]` for header fields, transcript layout, and date/time formats +**`[text_output]`** — header, layout, formatting for text/markdown output -Notable `[text_output]` options include: +| Key | Default | What it does | +|---|---|---| +| `layout_mode` | `"reading"` | `"reading"` (spacious) or `"compact"` (dense) | +| `heading_style` | `"markdown"` | `"markdown"` (with `#`) or `"plain"` | +| `turn_separator` | `"---"` | Separator between turns | +| `strip_chatgpt_artifacts` | `true` | Remove ChatGPT citation/nav artifacts | +| `wrap_width` | `88` | Line wrap width (`0` to disable) | +| `include_turn_count_in_header` | `true` | Show turn count in header | +| `include_turn_numbers` | `false` | Number each turn | -- `layout_mode = "reading" | "compact"` -- `heading_style = "plain" | "markdown"` -- `include_turn_count_in_header = true | false` -- 
`include_turn_numbers = true | false` -- `turn_separator = "---"` -- `strip_chatgpt_artifacts = true | false` -- `wrap_width = 88` +</details>
-Practical transcript presets: +<details>
+<summary>Config presets</summary> -Reading-first transcript: +**Reading-first** *(default)* ```toml [text_output] layout_mode = "reading" -heading_style = "plain" -include_turn_count_in_header = true +heading_style = "markdown" turn_separator = "---" strip_chatgpt_artifacts = true wrap_width = 88 ``` -Compact scanning transcript: +**Compact scanning** ```toml [text_output] layout_mode = "compact" @@ -254,55 +274,55 @@ turn_separator = "" wrap_width = 0 ``` -Markdown/notes transcript: +**Plain text / terminal** ```toml +[defaults] +format = "txt" + [text_output] layout_mode = "reading" -heading_style = "markdown" +heading_style = "plain" turn_separator = "---" ``` -CLI arguments override TOML values. `analyze` does not currently use export config defaults. - -## Architecture +</details>
-The structure is intentionally modular at the subsystem level: +--- -- command wiring and user-facing behavior live in `chatgpt_export_tool/commands/` -- streaming parse and analysis are separate from export formatting and writing -- structural field filtering and metadata filtering are separate concerns -- split-key resolution, filename policy, and writing are isolated from export orchestration +## Architecture -The core package is also grouped into shallow subpackages by concern: +``` +chatgpt_export_tool/ +├── cli.py ← Entry point & argparse +├── commands/ ← analyze, export command wiring +└── core/ + ├── parser.py ← Streaming JSON via ijson + ├── filter_pipeline.py ← Field + metadata filtering + ├── export_service.py ← Orchestration + ├── config/ ← TOML loading, models, validation + ├── transcript/ ← Branch reconstruction, text extraction + ├── validation/ ← Field & metadata validation + └── output/ ← Formatters, writer, path resolution +``` -- `core/config/` for runtime config models, loading, and validation -- `core/transcript/` for branch reconstruction and transcript extraction -- `core/validation/` for field and metadata validation -- `core/output/` for formatting, naming, path resolution, and writing +The design is deliberately modular: filtering, formatting, splitting, and writing are separate concerns. Most changes touch one small file, not a central controller. -That separation is deliberate: most behavior changes can be made in one small subsystem instead of in one large control file. 
+--- ## Development -Run the checks used during refactoring: - ```bash +# Tests uv run pytest uv run pytest --cov=chatgpt_export_tool --cov-report=term-missing -uv run ruff check chatgpt_export_tool tests pyproject.toml + +# Lint & format +uv run ruff check chatgpt_export_tool tests uv run ruff format --check chatgpt_export_tool tests ``` -If you need to format files: - -```bash -uv run ruff format chatgpt_export_tool tests -``` +--- -## Notes +## License -- Input handling is streaming, so large exports do not need to be loaded into memory just to analyze or iterate conversations. -- Single-file JSON export writes one valid JSON document. -- Split exports write one conversation per output file. -- Text export follows the active thread path using `current_node` and `parent` links. -- The field-selection and metadata-selection surface is documented in [Fields.md](Fields.md). +[MIT](LICENSE) — Void Freud ([@voidfreud](https://github.com/voidfreud)) diff --git a/chatgpt_export.toml.example b/chatgpt_export.toml.example index 11987dd..287d023 100644 --- a/chatgpt_export.toml.example +++ b/chatgpt_export.toml.example @@ -1,5 +1,5 @@ [defaults] -format = "txt" +format = "md" split = "single" fields = "all" output_dir = "output" @@ -32,7 +32,7 @@ header_fields = ["title", "id", "create_time"] conversation_time_format = "%H:%M %d-%m-%Y" turn_time_format = "%H:%M %d-%m-%Y" layout_mode = "reading" -heading_style = "plain" +heading_style = "markdown" include_turn_count_in_header = true include_turn_numbers = false turn_separator = "---" diff --git a/chatgpt_export_tool/commands/export.py b/chatgpt_export_tool/commands/export.py index 8ed65f5..4fc6db3 100644 --- a/chatgpt_export_tool/commands/export.py +++ b/chatgpt_export_tool/commands/export.py @@ -153,7 +153,7 @@ def add_export_parser( "export", help="Extract conversations with field selection and metadata filtering", description=( - "Export ChatGPT conversations to txt or json format.\n\n" + "Export ChatGPT conversations to 
md, txt, or json format.\n\n" "Use --fields to control which structural fields are retained,\n" "and compose --include/--exclude to filter metadata fields.\n" "Use --split to organize output into directories." @@ -200,9 +200,9 @@ def add_export_parser( export_parser.add_argument( "--format", "-F", - choices=["txt", "json"], + choices=["md", "txt", "json"], default=None, - help="Output format: 'txt' or 'json' (default from config or built-in fallback)", + help="Output format: 'md', 'txt', or 'json' (default: md)", ) export_parser.add_argument( "--fields", diff --git a/chatgpt_export_tool/core/config/models.py b/chatgpt_export_tool/core/config/models.py index 64355f3..fd9db8a 100644 --- a/chatgpt_export_tool/core/config/models.py +++ b/chatgpt_export_tool/core/config/models.py @@ -13,7 +13,7 @@ class DefaultsConfig: """Default CLI/runtime values.""" - format_type: str = "txt" + format_type: str = "md" split_mode: str = "single" field_spec: str = "all" output_dir: str = "output" diff --git a/chatgpt_export_tool/core/config/validation.py b/chatgpt_export_tool/core/config/validation.py index de06bce..bc885a5 100644 --- a/chatgpt_export_tool/core/config/validation.py +++ b/chatgpt_export_tool/core/config/validation.py @@ -9,7 +9,7 @@ def validate_defaults_config(defaults: DefaultsConfig) -> None: """Validate default config values that have constrained semantics.""" - valid_formats = {"txt", "json"} + valid_formats = {"md", "txt", "json"} if defaults.format_type not in valid_formats: raise ValueError( "Config value 'format' must be one of: " + ", ".join(sorted(valid_formats)) diff --git a/chatgpt_export_tool/core/export_service.py b/chatgpt_export_tool/core/export_service.py index 4a400da..1750838 100644 --- a/chatgpt_export_tool/core/export_service.py +++ b/chatgpt_export_tool/core/export_service.py @@ -40,7 +40,7 @@ class ExportConfig: """ filepath: str - format_type: str = "txt" + format_type: str = "md" output_file: Optional[str] = None output_dir: str = "output" 
split_mode: SplitMode = SplitMode.SINGLE diff --git a/chatgpt_export_tool/core/output/formatters.py b/chatgpt_export_tool/core/output/formatters.py index 2d56540..c1c935c 100644 --- a/chatgpt_export_tool/core/output/formatters.py +++ b/chatgpt_export_tool/core/output/formatters.py @@ -73,6 +73,10 @@ def __init__( indent, ) + @property + def _is_markdown(self) -> bool: + return self.text_output_config.heading_style == "markdown" + def format_conversation(self, conv: dict[str, Any]) -> str: """Format a conversation as text. @@ -82,7 +86,9 @@ def format_conversation(self, conv: dict[str, Any]) -> str: Returns: Formatted conversation text. """ - lines = ["-" * 40] + lines: list[str] = [] + if not self._is_markdown: + lines.append("-" * 40) if self.include_header: lines.extend(self._render_header(conv)) @@ -104,12 +110,18 @@ def format_conversation(self, conv: dict[str, Any]) -> str: and self.text_output_config.include_turn_count_in_header ): header_index = self._find_header_insert_index(lines) - lines.insert(header_index, f"Turns: {len(turn_blocks)}") + turn_label = ( + f"**Turns:** {len(turn_blocks)}" + if self._is_markdown + else f"Turns: {len(turn_blocks)}" + ) + lines.insert(header_index, turn_label) lines.insert(header_index + 1, "") lines.extend(self._render_chat_entries(turn_blocks)) - lines.append("-" * 40) + if not self._is_markdown: + lines.append("-" * 40) return "\n".join(lines) def _find_header_insert_index(self, lines: list[str]) -> int: @@ -135,7 +147,10 @@ def _render_context_entries(self, entries: list[TranscriptEntry]) -> list[str]: for entry in entries: for line in self._prepare_text(entry.text).splitlines(): if line.strip(): - lines.append(f"{self.indent}{line}") + if self._is_markdown: + lines.append(f"> {line}") + else: + lines.append(f"{self.indent}{line}") return lines def _group_chat_entries(self, entries: list[TranscriptEntry]) -> list[TurnBlock]: @@ -212,6 +227,11 @@ def _prepare_text(self, text: str) -> str: if 
self.text_output_config.strip_chatgpt_artifacts: prepared = CHATGPT_ARTIFACT_RE.sub("", prepared) prepared = re.sub(r"\n{3,}", "\n\n", prepared) + if self._is_markdown: + prepared = re.sub( + r"^[ \t]*[-*_]{3,}[ \t]*$", "", prepared, flags=re.MULTILINE + ) + prepared = re.sub(r"\n{3,}", "\n\n", prepared) return prepared.strip() def _render_section_heading(self, title: str) -> str: @@ -271,7 +291,13 @@ def _render_header(self, conv: dict[str, Any]) -> list[str]: value = self._get_header_value(conv, field_name) if value is None: continue - lines.append(f"{self._get_header_label(field_name)}: {value}") + if self._is_markdown: + if field_name == "title": + lines.append(f"# {value}") + else: + lines.append(f"**{self._get_header_label(field_name)}:** {value}") + else: + lines.append(f"{self._get_header_label(field_name)}: {value}") return lines def _get_header_label(self, field_name: str) -> str: @@ -335,6 +361,7 @@ def format_conversation(self, conv: dict[str, Any]) -> str: FORMATTERS = { + "md": TextFormatter, "txt": TextFormatter, "json": JSONFormatter, } @@ -360,6 +387,18 @@ def get_formatter(format_type: str, **kwargs: Any) -> BaseFormatter: formatter_kwargs = kwargs if format_type == "json": formatter_kwargs = {} + elif format_type == "md": + # Override text_output_config to force markdown heading style + text_output_config = formatter_kwargs.get("text_output_config") + if text_output_config is None: + from chatgpt_export_tool.core.config.runtime import TextOutputConfig + + text_output_config = TextOutputConfig(heading_style="markdown") + elif text_output_config.heading_style != "markdown": + from dataclasses import replace + + text_output_config = replace(text_output_config, heading_style="markdown") + formatter_kwargs["text_output_config"] = text_output_config formatter = FORMATTERS[format_type](**formatter_kwargs) logger.debug("Created formatter %s", type(formatter).__name__) return formatter diff --git a/chatgpt_export_tool/core/output/naming.py 
b/chatgpt_export_tool/core/output/naming.py index b02e2bf..91921aa 100644 --- a/chatgpt_export_tool/core/output/naming.py +++ b/chatgpt_export_tool/core/output/naming.py @@ -42,7 +42,7 @@ def sanitize(self, title: Optional[str]) -> str: return sanitized or "untitled" - def get_filename(self, stem: Any, extension: str = "txt") -> str: + def get_filename(self, stem: Any, extension: str = "md") -> str: """Build a filename from a stem and extension. Args: diff --git a/chatgpt_export_tool/core/output/paths.py b/chatgpt_export_tool/core/output/paths.py index 1fb5e21..7ad7cf4 100644 --- a/chatgpt_export_tool/core/output/paths.py +++ b/chatgpt_export_tool/core/output/paths.py @@ -17,7 +17,7 @@ class OutputPathResolver: def __init__( self, output_dir: str | Path = "output", - format_type: str = "txt", + format_type: str = "md", split_mode: Optional[SplitMode] = None, file_namer: Optional[FileNamer] = None, ) -> None: diff --git a/chatgpt_export_tool/core/output/writer.py b/chatgpt_export_tool/core/output/writer.py index 22e9832..a924d29 100644 --- a/chatgpt_export_tool/core/output/writer.py +++ b/chatgpt_export_tool/core/output/writer.py @@ -58,7 +58,7 @@ class OutputWriter: def __init__( self, output_dir: str = "output", - format_type: str = "txt", + format_type: str = "md", split_mode: Optional[SplitMode] = None, ) -> None: """Initialize an output writer. 
diff --git a/tests/test_output_writer.py b/tests/test_output_writer.py index f296e9c..4498ef5 100644 --- a/tests/test_output_writer.py +++ b/tests/test_output_writer.py @@ -97,13 +97,13 @@ def test_get_filename(self): assert filename == "My_Conversation.txt" def test_get_filename_default_extension(self): - """Test get_filename uses 'txt' as default extension.""" + """Test get_filename uses 'md' as default extension.""" namer = FileNamer() conv = {"title": "Test"} filename = namer.get_filename(conv) - assert filename == "Test.txt" + assert filename == "Test.md" def test_get_filename_falls_back_to_untitled(self): """Test get_filename uses 'untitled' when no title.""" @@ -112,7 +112,7 @@ def test_get_filename_falls_back_to_untitled(self): conv = {} filename = namer.get_filename(conv) - assert filename == "untitled.txt" + assert filename == "untitled.md" class TestWriteResult: @@ -170,7 +170,7 @@ def test_init_default_values(self): writer = OutputWriter() assert writer.output_dir == Path("output") - assert writer.format_type == "txt" + assert writer.format_type == "md" assert writer.split_mode is None def test_init_with_values(self): @@ -193,7 +193,7 @@ def test_path_resolution_subject_mode_uses_title_plus_id(self): path = resolver.get_filepath({"title": "Test", "id": "123"}, "Test_123") - assert path == Path("/output/Test_123.txt") + assert path == Path("/output/Test_123.md") def test_write_conversations_single_group(self, tmp_path): """Test write_jobs with two conversations.""" diff --git a/tests/test_runtime_config.py b/tests/test_runtime_config.py index 0f0d769..525d9f7 100644 --- a/tests/test_runtime_config.py +++ b/tests/test_runtime_config.py @@ -19,7 +19,7 @@ def test_load_runtime_config_defaults_when_no_file( config = load_runtime_config() - assert config.defaults.format_type == "txt" + assert config.defaults.format_type == "md" assert config.transcript.show_assistant_thoughts is True assert config.text_output.header_fields == ("title", "id", "create_time") 
assert config.source_path is None @@ -69,7 +69,7 @@ def test_load_runtime_config_does_not_implicitly_read_cwd_file( config = load_runtime_config() - assert config.defaults.format_type == "txt" + assert config.defaults.format_type == "md" assert config.source_path is None diff --git a/tests/test_runtime_contract.py b/tests/test_runtime_contract.py index b27238a..8bec08c 100644 --- a/tests/test_runtime_contract.py +++ b/tests/test_runtime_contract.py @@ -352,8 +352,8 @@ def test_single_split_without_output_writes_to_stdout( captured = capsys.readouterr() assert exit_code == 0 - assert "Title: Alpha" in captured.out - assert "Title: Beta" in captured.out + assert "# Alpha" in captured.out + assert "# Beta" in captured.out assert "Exported 2 files" not in captured.out assert not output_dir.exists() @@ -373,7 +373,7 @@ def test_single_split_with_output_writes_one_file(self, tmp_path: Path) -> None: assert exit_code == 0 assert output_file.exists() content = output_file.read_text(encoding="utf-8") - assert "Title: Alpha" in content + assert "# Alpha" in content def test_single_json_output_is_valid_json(self, tmp_path: Path) -> None: """Single JSON export writes one valid JSON document.""" @@ -499,7 +499,7 @@ def test_split_subject_naming_uses_stable_source_fields( assert exit_code == 0 names = sorted(path.name for path in output_dir.iterdir()) - assert names == ["Same_conv-1.txt", "Same_conv-2.txt"] + assert names == ["Same_conv-1.md", "Same_conv-2.md"] def test_split_write_failures_return_non_zero_exit_code( self, tmp_path: Path