From 661f5846bf676e6f530594c81df14d50c2a5c011 Mon Sep 17 00:00:00 2001
From: Void Freud
Date: Fri, 27 Mar 2026 21:33:22 +0200
Subject: [PATCH 1/2] Use UTC timestamps, configurable analysis dates, and add
CI/publish workflows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Switch format_timestamp and get_date_group_key to UTC for reproducible output
- Make analysis date format configurable via AnalyzeConfig.time_format
- Add GitHub Actions CI workflow (test + lint across Python 3.10–3.13)
- Add GitHub Actions publish workflow (trusted PyPI publishing on release)
Co-Authored-By: Claude Opus 4.6
---
.github/workflows/ci.yml | 36 +++++++++++++++++++
.github/workflows/publish.yml | 28 +++++++++++++++
chatgpt_export_tool/commands/analyze.py | 11 ++++--
.../core/analysis_formatter.py | 8 +++--
chatgpt_export_tool/core/output/paths.py | 4 ++-
chatgpt_export_tool/core/transcript/access.py | 6 ++--
chatgpt_export_tool/core/utils.py | 10 +++---
7 files changed, 90 insertions(+), 13 deletions(-)
create mode 100644 .github/workflows/ci.yml
create mode 100644 .github/workflows/publish.yml
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..3c0a2ed
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,36 @@
+name: CI
+
+on:
+ push:
+ branches: [main]
+ pull_request:
+ branches: [main]
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install uv
+ uses: astral-sh/setup-uv@v4
+
+ - name: Install dependencies
+ run: uv sync --group dev
+
+ - name: Lint
+ run: uv run ruff check chatgpt_export_tool tests
+
+ - name: Format check
+ run: uv run ruff format --check chatgpt_export_tool tests
+
+ - name: Test
+ run: uv run pytest --cov=chatgpt_export_tool --cov-report=term-missing
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..d7becf6
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,28 @@
+name: Publish to PyPI
+
+on:
+ release:
+ types: [published]
+
+permissions:
+ id-token: write
+
+jobs:
+ publish:
+ runs-on: ubuntu-latest
+ environment: pypi
+ steps:
+ - uses: actions/checkout@v4
+
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.13"
+
+ - name: Install build tools
+ run: pip install build
+
+ - name: Build package
+ run: python -m build
+
+ - name: Publish to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/chatgpt_export_tool/commands/analyze.py b/chatgpt_export_tool/commands/analyze.py
index 28a9590..b2d429c 100644
--- a/chatgpt_export_tool/commands/analyze.py
+++ b/chatgpt_export_tool/commands/analyze.py
@@ -1,7 +1,7 @@
"""Analyze command for chatgpt_export_tool."""
import argparse
-from datetime import datetime
+from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
@@ -11,6 +11,7 @@
AnalyzeConfig,
format_analysis_text,
)
+from chatgpt_export_tool.core.config.models import DEFAULT_TIME_FORMAT
from chatgpt_export_tool.core.file_utils import get_file_size
from chatgpt_export_tool.core.parser import JSONParser
from chatgpt_export_tool.core.utils import format_size
@@ -54,11 +55,15 @@ def _execute(self) -> None:
results = parser.analyze(verbose=self.logger.level <= 20)
results["file_size"] = format_size(file_size)
results["filepath"] = self.filepath
- results["analysis_date"] = datetime.now().strftime("%H:%M %d-%m-%Y")
+ time_format = DEFAULT_TIME_FORMAT
+ results["analysis_date"] = datetime.now(tz=timezone.utc).strftime(time_format)
output = format_analysis_text(
results,
- AnalyzeConfig(include_fields=self.include_fields),
+ AnalyzeConfig(
+ include_fields=self.include_fields,
+ time_format=time_format,
+ ),
)
if self.output_file:
diff --git a/chatgpt_export_tool/core/analysis_formatter.py b/chatgpt_export_tool/core/analysis_formatter.py
index 86453eb..5dfd8e4 100644
--- a/chatgpt_export_tool/core/analysis_formatter.py
+++ b/chatgpt_export_tool/core/analysis_formatter.py
@@ -16,9 +16,11 @@ class AnalyzeConfig:
Attributes:
include_fields: Whether to include field coverage details.
+ time_format: strftime format for date/time display.
"""
include_fields: bool = False
+ time_format: str = "%H:%M %d-%m-%Y"
def format_analysis_text(
@@ -56,8 +58,10 @@ def format_analysis_text(
lines.append(f"Total message nodes in mappings: {results['message_count']:,}")
if results.get("min_date") is not None and results.get("max_date") is not None:
- lines.append(f"From: {format_timestamp(results['min_date'])}")
- lines.append(f"To: {format_timestamp(results['max_date'])}")
+ lines.append(
+ f"From: {format_timestamp(results['min_date'], config.time_format)}"
+ )
+ lines.append(f"To: {format_timestamp(results['max_date'], config.time_format)}")
lines.append("")
diff --git a/chatgpt_export_tool/core/output/paths.py b/chatgpt_export_tool/core/output/paths.py
index 65e7a0b..1fb5e21 100644
--- a/chatgpt_export_tool/core/output/paths.py
+++ b/chatgpt_export_tool/core/output/paths.py
@@ -94,4 +94,6 @@ def get_unique_filepath(
candidate = filepath.with_name(f"{filepath.stem}_{suffix}{filepath.suffix}")
if candidate not in used_paths and not candidate.exists():
return candidate
- raise RuntimeError(f"Could not find unique path for {filepath} after 10000 attempts")
+ raise RuntimeError(
+ f"Could not find unique path for {filepath} after 10000 attempts"
+ )
diff --git a/chatgpt_export_tool/core/transcript/access.py b/chatgpt_export_tool/core/transcript/access.py
index 4b117ab..f375955 100644
--- a/chatgpt_export_tool/core/transcript/access.py
+++ b/chatgpt_export_tool/core/transcript/access.py
@@ -1,6 +1,6 @@
"""Shared read-only helpers for conversation structures."""
-from datetime import datetime
+from datetime import datetime, timezone
from typing import Any, Iterator, Optional
from .thread import (
@@ -86,7 +86,9 @@ def get_date_group_key(conversation: dict[str, Any]) -> Optional[str]:
return None
try:
- return datetime.fromtimestamp(float(create_time)).strftime("%Y-%m-%d")
+ return datetime.fromtimestamp(float(create_time), tz=timezone.utc).strftime(
+ "%Y-%m-%d"
+ )
except (TypeError, ValueError, OSError):
return None
diff --git a/chatgpt_export_tool/core/utils.py b/chatgpt_export_tool/core/utils.py
index 8240089..fd069d0 100644
--- a/chatgpt_export_tool/core/utils.py
+++ b/chatgpt_export_tool/core/utils.py
@@ -1,6 +1,6 @@
"""Small shared formatting helpers."""
-from datetime import datetime
+from datetime import datetime, timezone
def format_size(size_bytes: int) -> str:
@@ -21,14 +21,14 @@ def format_size(size_bytes: int) -> str:
def format_timestamp(timestamp: float, time_format: str = "%H:%M %d-%m-%Y") -> str:
- """Format a Unix timestamp to human-readable date string.
+ """Format a Unix timestamp to a UTC date string.
Args:
timestamp: Unix timestamp (seconds since epoch). May be float, int, or Decimal.
+ time_format: strftime format string.
Returns:
- Formatted date string.
+ Formatted UTC date string.
"""
- # Handle Decimal values from JSON parser
- dt = datetime.fromtimestamp(float(timestamp))
+ dt = datetime.fromtimestamp(float(timestamp), tz=timezone.utc)
return dt.strftime(time_format)
From 7016eb3294f0540ecf44ee2a97ea7555a074f2c1 Mon Sep 17 00:00:00 2001
From: Void Freud
Date: Sat, 28 Mar 2026 00:27:34 +0200
Subject: [PATCH 2/2] Default to Markdown output format with proper md syntax
- Add md as a first-class format (default), alongside txt and json
- Markdown output uses # headings, **bold** metadata, > blockquoted
context, --- separators, and no indentation
- get_formatter auto-applies markdown heading style for md format
- txt format still works unchanged via --format txt
- Rewrite README and Fields.md with polished, professional styling
Co-Authored-By: Claude Opus 4.6
---
Fields.md | 233 ++++-------
README.md | 362 +++++++++---------
chatgpt_export.toml.example | 4 +-
chatgpt_export_tool/commands/export.py | 6 +-
chatgpt_export_tool/core/config/models.py | 2 +-
chatgpt_export_tool/core/config/validation.py | 2 +-
chatgpt_export_tool/core/export_service.py | 2 +-
chatgpt_export_tool/core/output/formatters.py | 49 ++-
chatgpt_export_tool/core/output/naming.py | 2 +-
chatgpt_export_tool/core/output/paths.py | 2 +-
chatgpt_export_tool/core/output/writer.py | 2 +-
tests/test_output_writer.py | 10 +-
tests/test_runtime_config.py | 4 +-
tests/test_runtime_contract.py | 8 +-
14 files changed, 338 insertions(+), 350 deletions(-)
diff --git a/Fields.md b/Fields.md
index 1e0caa2..67456fe 100644
--- a/Fields.md
+++ b/Fields.md
@@ -1,158 +1,109 @@
# Field Selection Reference
-This document describes the field-selection and metadata-selection features that the current CLI actually supports.
+Practical reference for the `--fields` and `--include`/`--exclude` options in `chatgpt-export`.
-It is intentionally practical rather than exhaustive. The goal is to document the fields, groups, and selectors you can use with `chatgpt-export`, not to guess every field that might appear in every historical `conversations.json` file.
+---
-## Structural Levels
+## How Data Is Structured
-The tool understands conversation data at these nested levels:
+ChatGPT exports nest data at these levels:
-```text
+```
conversation
└── mapping node
└── message
- ├── author
- ├── content
- └── metadata
+ ├── author (role, name)
+ ├── content (content_type, parts, text, ...)
+ └── metadata (model_slug, message_type, ...)
```
-The field selector can retain or remove fields across those levels while preserving the containers needed to reach nested selected fields.
-
-Text export is transcript-oriented: it follows the active branch defined by `current_node` and `parent` links, then applies transcript visibility rules from the TOML config passed to `export`.
-
-## `--fields`
+The field selector retains or removes fields across these levels while preserving the containers needed to reach any selected nested field.
-The `--fields` argument accepts one field-selection spec.
+Text/Markdown export is **transcript-oriented** — it follows the active branch via `current_node` and `parent` links, then applies visibility rules from the TOML config.
-Supported forms:
+---
-```text
-all
-none
-include field1,field2
-exclude field1,field2
-groups group1,group2
-```
+## `--fields`
-Examples:
+Controls which structural fields survive before formatting.
```bash
-chatgpt-export export data.json --fields all
-chatgpt-export export data.json --fields none
-chatgpt-export export data.json --fields "include title,create_time,mapping"
-chatgpt-export export data.json --fields "exclude moderation_results,plugin_ids"
-chatgpt-export export data.json --fields "groups minimal"
-chatgpt-export export data.json --fields "groups conversation,message"
+--fields all # keep everything (default)
+--fields none # empty structure
+--fields "include title,create_time,mapping" # whitelist
+--fields "exclude moderation_results" # blacklist
+--fields "groups minimal" # named group
+--fields "groups conversation,message" # combine groups
```
-Multi-word specs must be quoted.
+> Multi-word specs must be quoted.
-## Field Groups
+---
-The current built-in field groups are:
+## Built-in Field Groups
### `conversation`
-Includes:
-
-- `_id`
-- `conversation_id`
-- `create_time`
-- `update_time`
-- `title`
-- `type`
+`_id` · `conversation_id` · `create_time` · `update_time` · `title` · `type`
### `message`
-Includes:
-
-- `author`
-- `content`
-- `status`
-- `end_turn`
+`author` · `content` · `status` · `end_turn`
### `metadata`
-Includes:
-
-- `model_slug`
-- `message_type`
-- `is_archived`
+`model_slug` · `message_type` · `is_archived`
### `minimal`
-Includes:
+`title` · `create_time` · `message`
-- `title`
-- `create_time`
-- `message`
+---
-## Known Structural Fields
+## All Known Structural Fields
-These are the structural fields the tool currently categorizes by level.
+<details>
+<summary>Conversation level</summary>
-### Conversation
+`title` · `create_time` · `update_time` · `mapping` · `moderation_results` · `current_node` · `plugin_ids` · `_id` · `conversation_id` · `type`
-- `title`
-- `create_time`
-- `update_time`
-- `mapping`
-- `moderation_results`
-- `current_node`
-- `plugin_ids`
-- `_id`
-- `conversation_id`
-- `type`
+</details>
-### Mapping Node
+<details>
+<summary>Mapping node level</summary>
-- `id`
-- `parent`
-- `children`
-- `message`
+`id` · `parent` · `children` · `message`
-### Message
+</details>
-- `author`
-- `content`
-- `status`
-- `end_turn`
-- `weight`
-- `recipient`
-- `channel`
-- `create_time`
-- `update_time`
+<details>
+<summary>Message level</summary>
-### Author
+`author` · `content` · `status` · `end_turn` · `weight` · `recipient` · `channel` · `create_time` · `update_time`
-- `role`
-- `name`
+</details>
-### Content
+<details>
+<summary>Author</summary>
-- `content_type`
-- `parts`
-- `language`
-- `response_format_name`
-- `text`
-- `user_profile`
-- `user_instructions`
+`role` · `name`
-Unknown names are still allowed in `include` and `exclude` field specs, but the validator may warn about them.
+</details>
-## Metadata Filtering
+<details>
+<summary>Content</summary>
-Metadata filtering is separate from `--fields`.
+`content_type` · `parts` · `language` · `response_format_name` · `text` · `user_profile` · `user_instructions`
-Use:
+</details>
-- `--include PATTERN [PATTERN ...]`
-- `--exclude PATTERN [PATTERN ...]`
+> Unknown field names are allowed in `include`/`exclude` specs, but the validator may warn about them.
-These apply to known metadata names inside nested `message.metadata` dictionaries after structural field filtering.
+---
-Examples:
+## Metadata Filtering
+
+Separate from `--fields`. Applies only to keys inside `message.metadata` dictionaries, *after* structural filtering.
```bash
chatgpt-export export data.json --include model_slug
@@ -160,73 +111,51 @@ chatgpt-export export data.json --include "model*" --exclude plugin_ids
chatgpt-export export data.json --fields "groups message" --include is_archived
```
-Pattern matching supports:
-
-- exact matches
-- substring matches
-- shell-style wildcards such as `model*`
-
-## Known Metadata Names
+**Pattern matching:** exact, substring, and shell-style globs (`model*`).
-The current metadata filter recognizes these names:
+**Known metadata names:** `model_slug` · `message_type` · `plugin_ids` · `is_archived`
-- `model_slug`
-- `message_type`
-- `plugin_ids`
-- `is_archived`
+---
-## How Filtering Combines
+## Filtering Pipeline
-Filtering happens in this order:
-
-1. structural field selection through `--fields`
-2. metadata filtering through `--include` and `--exclude`
-3. formatting to text or JSON
-
-This means:
+```
+conversations.json
+ → 1. Structural field selection (--fields)
+ → 2. Metadata filtering (--include / --exclude)
+ → 3. Formatting (md / txt / json)
+ → output
+```
-- `--fields` decides whether structural containers like `mapping`, `message`, `author`, `content`, and `metadata` survive
-- `--include` and `--exclude` decide which metadata keys remain inside metadata dictionaries
+`--fields` decides whether containers like `mapping`, `message`, and `metadata` survive.
+`--include`/`--exclude` decide which keys remain inside those metadata containers.
-## Practical Recipes
+---
-Keep only a small readable subset:
+## Recipes
```bash
+# Minimal readable export
chatgpt-export export data.json --fields "groups minimal"
-```
-
-Keep titles and timestamps but drop plugin noise:
-```bash
+# Drop noise, keep structure
chatgpt-export export data.json --fields "exclude plugin_ids,moderation_results"
-```
-
-Keep only message-oriented structure and model metadata:
-```bash
+# Message structure + model info only
chatgpt-export export data.json --fields "groups message" --include "model*"
-```
-Write one file per conversation with a minimal payload:
-
-```bash
+# One file per conversation, minimal payload
chatgpt-export export data.json --split subject --output-dir exports --fields "groups minimal"
```
+---
+
## Notes
-- `analyze --fields` reports field coverage; it does not accept the export-style field-selection spec.
-- `export` can load defaults from a TOML file via `--config PATH`.
-- The repo ships `chatgpt_export.toml.example` as a template; copy it to a local file before use.
-- `export --split single` writes to stdout unless `--output` is provided.
-- Subject split files are named from the source conversation title plus identifier.
-- Split modes such as `subject`, `date`, and `id` write to `--output-dir`.
-- Text export follows the active conversation branch and is configurable through the `[transcript]` and `[text_output]` TOML sections.
-- Default text export shows user text, assistant text, assistant thoughts, and a compact preview of `user_editable_context`.
-- Default text export hides assistant code, reasoning recap, and tool plumbing unless the transcript policy explicitly enables them.
-- Advanced transcript controls include `user_editable_context_mode`, `show_visually_hidden_content_types`, `include_content_types`, and `exclude_content_types`.
-- Text layout controls include `layout_mode`, `heading_style`, `include_turn_count_in_header`, `include_turn_numbers`, `turn_separator`, `strip_chatgpt_artifacts`, and `wrap_width`.
-- A practical default is `layout_mode = "reading"` with `turn_separator = "---"` and artifact stripping enabled.
-- For tighter exports, use `layout_mode = "compact"` and disable turn counts.
-- For notes-oriented output, use `heading_style = "markdown"`.
+- `analyze --fields` reports field *coverage* — it does not use the export-style field-selection spec.
+- `export --split single` writes to stdout unless `--output` is given.
+- Subject split files are named `Title_ID.md` (or `.txt`/`.json`).
+- Default Markdown export shows user text, assistant text, thoughts, and compact context previews.
+- Hidden by default: assistant code, reasoning recap, tool plumbing.
+- All transcript visibility is configurable via `[transcript]` in the TOML config.
+- Text layout is configurable via `[text_output]`: layout mode, heading style, wrap width, separators, turn numbering.
diff --git a/README.md b/README.md
index d1e1820..510128e 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,27 @@
-# ChatGPT Export Tool
+
+
ChatGPT Export Tool
+
+ Stream, analyze, and export your ChatGPT history — without loading it all into memory.
+
+
+
+
+
+
+
+
-[](https://www.python.org/)
-[](LICENSE)
-[](https://github.com/astral-sh/ruff)
+
-A CLI for analyzing and exporting ChatGPT `conversations.json` files.
+A Python CLI that takes your `conversations.json` export from ChatGPT and turns it into clean, readable **Markdown**, plain text, or structured JSON — with full control over what gets included.
-The project focuses on two things:
+It uses **streaming JSON parsing** ([ijson](https://github.com/ICRAR/ijson)) so even multi-hundred-megabyte exports never need to be loaded into memory. Filtering, formatting, and output are all modular and independently configurable through CLI flags or a single TOML config file.
-- analyzing the structure of a ChatGPT export without loading the whole file into memory
-- exporting conversations to text or JSON with structural field filtering and metadata filtering
-
-It uses streaming JSON parsing with `ijson` and is organized around small core modules so filtering, formatting, split behavior, and path generation can be changed independently.
-
-Persistent defaults can be stored in a single TOML config file. The repo ships a template at `chatgpt_export.toml.example`.
+---
## Installation
-This project currently targets Python `3.10+`.
+> **Requires Python `3.10+`** · Managed with [uv](https://docs.astral.sh/uv/)
```bash
git clone https://github.com/voidfreud/chatgpt-export-tool.git
@@ -25,67 +29,51 @@ cd chatgpt-export-tool
uv sync
```
-For development tooling too:
+For dev tooling (pytest, ruff, coverage):
```bash
uv sync --group dev
```
-You can then run the CLI with:
+Verify the install:
```bash
uv run chatgpt-export --help
```
-To apply config defaults, copy the template and pass `--config PATH`.
+---
## Quick Start
-Analyze an export:
-
-```bash
-uv run chatgpt-export analyze path/to/conversations.json
-```
-
-Include field coverage:
-
```bash
-uv run chatgpt-export analyze path/to/conversations.json --fields
-```
+# Analyze your export — stats without loading the whole file
+uv run chatgpt-export analyze conversations.json
-Export everything as text to stdout:
+# Export everything to markdown (default) on stdout
+uv run chatgpt-export export conversations.json
-```bash
-uv run chatgpt-export export path/to/conversations.json
-```
+# Export to a single file
+uv run chatgpt-export export conversations.json --output all_chats.md
-Export everything as JSON to one file:
+# One markdown file per conversation
+uv run chatgpt-export export conversations.json --split subject --output-dir exports/
-```bash
-uv run chatgpt-export export path/to/conversations.json --format json --output conversations.json
+# JSON export
+uv run chatgpt-export export conversations.json --format json --output dump.json
```
-Export one file per conversation:
-
-```bash
-uv run chatgpt-export export path/to/conversations.json --split subject --output-dir exports
-```
+---
## Commands
### `analyze`
-`analyze` reports high-level structure and statistics for a `conversations.json` file.
-
-It includes:
-
-- conversation count
-- message count
-- file size
-- date range
-- optional field coverage with `--fields`
+Reports structure and statistics for a `conversations.json` file; it does not export any conversations.
-Examples:
+| What it shows | Flag |
+|---|---|
+| Conversation & message counts, file size, date range | *(default)* |
+| Field coverage per structural level | `--fields` |
```bash
uv run chatgpt-export analyze data.json
@@ -96,156 +84,188 @@ uv run chatgpt-export analyze data.json --debug
### `export`
-`export` writes conversations in either text or JSON format.
+Converts conversations into **Markdown** (default), **plain text**, or **JSON** with fine-grained control over structure, metadata, and output layout.
-It supports:
+```bash
+# Minimal readable export
+uv run chatgpt-export export data.json --fields "groups minimal"
-- structural field filtering through `--fields`
-- metadata filtering through `--include` and `--exclude`
-- transcript-oriented text export that follows the active branch
-- split modes for one output, one file per conversation, date folders, or ID-based files
+# Full JSON, one file per conversation by date
+uv run chatgpt-export export data.json --format json --split date --output-dir by-date/
-Examples:
+# Selective metadata
+uv run chatgpt-export export data.json --include "model*" --exclude plugin_ids
-```bash
-uv run chatgpt-export export data.json
-uv run chatgpt-export export data.json --output conversations.txt
-uv run chatgpt-export export data.json --format json --output conversations.json
-uv run chatgpt-export export data.json --split subject --output-dir exports
-uv run chatgpt-export export data.json --fields "groups minimal" --split subject --output-dir exports
-uv run chatgpt-export export data.json --fields "include title,mapping" --include "model*" --exclude plugin_ids
+# Use a config file for persistent defaults
cp chatgpt_export.toml.example chatgpt_export.toml
uv run chatgpt-export export data.json --config chatgpt_export.toml
```
-## Field Filtering
+---
-The `--fields` option controls which structural fields are retained before formatting.
+## Output Formats
-Supported forms:
+| Format | Flag | Extension | Description |
+|---|---|---|---|
+| **Markdown** | `--format md` | `.md` | Transcript-oriented with `#` headings, `>` blockquoted context, `---` separators. **Default.** |
+| **Plain text** | `--format txt` | `.txt` | Indented text with plain labels — good for terminals and grep. |
+| **JSON** | `--format json` | `.json` | Filtered conversation objects written as valid JSON. |
-- `all`
-- `none`
-- `include field1,field2`
-- `exclude field1,field2`
-- `groups group1,group2`
+Markdown and text exports follow the **active conversation branch** using `current_node` and `parent` links, so you see the conversation as it actually played out — not the full tree with all edits and branches.
-Examples:
+
+What the Markdown output looks like
-```bash
-uv run chatgpt-export export data.json --fields all
-uv run chatgpt-export export data.json --fields none
-uv run chatgpt-export export data.json --fields "include title,create_time,mapping"
-uv run chatgpt-export export data.json --fields "exclude moderation_results,plugin_ids"
-uv run chatgpt-export export data.json --fields "groups minimal"
+```markdown
+# Opening bank account in Thailand
+**ID:** 68ed8eba-1d00-832a-a2f1-4721496ed217
+**Created:** 23:43 13-10-2025
+**Turns:** 34
+
+## Conversation Context
+> User profile: The user provided the following information...
+> User instructions: Speaking to Nova. Zero restraint...
+
+## User [23:43 13-10-2025]
+Can a foreigner open a Thailand bank account?
+
+## Assistant [23:44 13-10-2025]
+Yup — a foreigner *can* open a bank account in Thailand, but it's
+*much more difficult* now than it used to be...
```
-Available field groups:
+
+
+Default transcript behavior:
-- `conversation`
-- `message`
-- `metadata`
-- `minimal`
+- **Shown:** user text, assistant text, assistant thoughts, user editable context (compact preview)
+- **Hidden:** tool plumbing, assistant code, reasoning recap, blank nodes
-See [Fields.md](Fields.md) for the current field-selection reference.
+All of this is configurable via the `[transcript]` section in the TOML config.
-## Metadata Filtering
+---
-The metadata filter runs after structural field filtering and applies only to keys inside nested `message.metadata` dictionaries.
+## Filtering
-Examples:
+### Structural Fields — `--fields`
+
+Controls which parts of each conversation object are retained before formatting.
```bash
-uv run chatgpt-export export data.json --include model_slug
-uv run chatgpt-export export data.json --include "model*" --exclude plugin_ids
-uv run chatgpt-export export data.json --fields "groups message" --include is_archived
+--fields all # everything (default)
+--fields none # structure only
+--fields "include title,create_time,mapping" # keep only these
+--fields "exclude moderation_results" # drop these
+--fields "groups minimal" # use a named group
```
-Currently supported metadata names include:
+**Built-in groups:**
-- `model_slug`
-- `message_type`
-- `plugin_ids`
-- `is_archived`
+| Group | Fields |
+|---|---|
+| `conversation` | `_id`, `conversation_id`, `create_time`, `update_time`, `title`, `type` |
+| `message` | `author`, `content`, `status`, `end_turn` |
+| `metadata` | `model_slug`, `message_type`, `is_archived` |
+| `minimal` | `title`, `create_time`, `message` |
-## Split Modes
+See [`Fields.md`](Fields.md) for the full field-selection reference.
-`export` supports four split modes:
+### Metadata — `--include` / `--exclude`
-- `single`: one combined output stream or one output file
-- `subject`: one file per conversation, named from title plus identifier
-- `date`: date folders with one file per conversation
-- `id`: one file per conversation, named from conversation ID
+Runs *after* structural filtering. Applies only to keys inside nested `message.metadata` dictionaries.
-Important output behavior:
+```bash
+--include model_slug # keep only model_slug
+--include "model*" --exclude plugin_ids # glob patterns supported
+```
-- `--split single` with no `--output` writes to stdout
-- `--split single --output FILE` writes one file
-- split modes like `subject`, `date`, and `id` write into `--output-dir`
+Known metadata names: `model_slug`, `message_type`, `plugin_ids`, `is_archived`.
-## Output Formats
+---
-Supported formats:
+## Split Modes
-- `txt`
-- `json`
+Control how conversations are distributed across output files.
-`txt` is a transcript-oriented export that follows the active branch of the conversation tree.
-`json` writes the filtered conversation objects directly.
+| Mode | Flag | Behavior |
+|---|---|---|
+| **Single** | `--split single` | One combined stream or file *(default)* |
+| **Subject** | `--split subject` | One file per conversation, named `Title_ID.md` |
+| **Date** | `--split date` | Daily folders → one file per conversation |
+| **ID** | `--split id` | One file per conversation, named by conversation ID |
-By default, text export includes user text, assistant text, assistant thoughts, and user editable context when present. User editable context is rendered in a compact preview by default so transcripts stay readable. Text export hides tool plumbing, assistant code, reasoning recap, and blank/internal nodes unless the transcript policy is changed in config.
+```bash
+# Stdout (single, no --output)
+uv run chatgpt-export export data.json
-Text output defaults now favor reading clarity:
+# Single file
+uv run chatgpt-export export data.json --output all.md
-- conversation context is rendered as a separate preamble block
-- visible turns are grouped into clearer chat-style `User` / `Assistant` sections
-- turn counts can be shown in the header
-- ChatGPT citation/navigation artifacts can be stripped from text output
-- long paragraphs can be wrapped for easier reading
+# Split into a directory
+uv run chatgpt-export export data.json --split subject --output-dir exports/
+```
-Important transcript policy options include:
+> **Note:** `--output` is for single mode only. Split modes use `--output-dir`.
-- `user_editable_context_mode`
-- `show_visually_hidden_content_types`
-- `include_content_types`
-- `exclude_content_types`
+---
## Configuration
-`export` accepts `--config PATH` and resolves defaults from one TOML file.
+All export behavior can be persisted in a single TOML file. The repo ships [`chatgpt_export.toml.example`](chatgpt_export.toml.example) as a starting point.
+
+```bash
+cp chatgpt_export.toml.example chatgpt_export.toml
+# edit to taste, then:
+uv run chatgpt-export export data.json --config chatgpt_export.toml
+```
+
+CLI flags always override TOML values.
+
+
+TOML sections overview
+
+**`[defaults]`** — format, split mode, field selection, output directory, metadata filters
-The repo ships `chatgpt_export.toml.example` as a template. Copy it to a local file such as `chatgpt_export.toml` and pass that path explicitly.
+**`[transcript]`** — branch following, visibility rules per content type
-The config file is TOML and is intentionally kept to one file with sections such as:
+| Key | Default | What it does |
+|---|---|---|
+| `show_system_messages` | `false` | Include system prompts |
+| `show_tool_messages` | `false` | Include tool/function calls |
+| `show_assistant_code` | `false` | Include code execution blocks |
+| `show_reasoning_recap` | `false` | Include reasoning summaries |
+| `user_editable_context_mode` | `"compact"` | `"compact"` or `"full"` for context rendering |
+| `include_content_types` | `[]` | Whitelist specific content types |
+| `exclude_content_types` | `[]` | Blacklist specific content types |
-- `[defaults]` for format, split mode, field selection, and output directory
-- `[transcript]` for active-branch reconstruction and visibility rules
-- `[text_output]` for header fields, transcript layout, and date/time formats
+**`[text_output]`** — header, layout, formatting for text/markdown output
-Notable `[text_output]` options include:
+| Key | Default | What it does |
+|---|---|---|
+| `layout_mode` | `"reading"` | `"reading"` (spacious) or `"compact"` (dense) |
+| `heading_style` | `"markdown"` | `"markdown"` (with `#`) or `"plain"` |
+| `turn_separator` | `"---"` | Separator between turns |
+| `strip_chatgpt_artifacts` | `true` | Remove ChatGPT citation/nav artifacts |
+| `wrap_width` | `88` | Line wrap width (`0` to disable) |
+| `include_turn_count_in_header` | `true` | Show turn count in header |
+| `include_turn_numbers` | `false` | Number each turn |
-- `layout_mode = "reading" | "compact"`
-- `heading_style = "plain" | "markdown"`
-- `include_turn_count_in_header = true | false`
-- `include_turn_numbers = true | false`
-- `turn_separator = "---"`
-- `strip_chatgpt_artifacts = true | false`
-- `wrap_width = 88`
+
-Practical transcript presets:
+
+Config presets
-Reading-first transcript:
+**Reading-first** *(default)*
```toml
[text_output]
layout_mode = "reading"
-heading_style = "plain"
-include_turn_count_in_header = true
+heading_style = "markdown"
turn_separator = "---"
strip_chatgpt_artifacts = true
wrap_width = 88
```
-Compact scanning transcript:
+**Compact scanning**
```toml
[text_output]
layout_mode = "compact"
@@ -254,55 +274,55 @@ turn_separator = ""
wrap_width = 0
```
-Markdown/notes transcript:
+**Plain text / terminal**
```toml
+[defaults]
+format = "txt"
+
[text_output]
layout_mode = "reading"
-heading_style = "markdown"
+heading_style = "plain"
turn_separator = "---"
```
-CLI arguments override TOML values. `analyze` does not currently use export config defaults.
-
-## Architecture
+
-The structure is intentionally modular at the subsystem level:
+---
-- command wiring and user-facing behavior live in `chatgpt_export_tool/commands/`
-- streaming parse and analysis are separate from export formatting and writing
-- structural field filtering and metadata filtering are separate concerns
-- split-key resolution, filename policy, and writing are isolated from export orchestration
+## Architecture
-The core package is also grouped into shallow subpackages by concern:
+```
+chatgpt_export_tool/
+├── cli.py ← Entry point & argparse
+├── commands/ ← analyze, export command wiring
+└── core/
+ ├── parser.py ← Streaming JSON via ijson
+ ├── filter_pipeline.py ← Field + metadata filtering
+ ├── export_service.py ← Orchestration
+ ├── config/ ← TOML loading, models, validation
+ ├── transcript/ ← Branch reconstruction, text extraction
+ ├── validation/ ← Field & metadata validation
+ └── output/ ← Formatters, writer, path resolution
+```
-- `core/config/` for runtime config models, loading, and validation
-- `core/transcript/` for branch reconstruction and transcript extraction
-- `core/validation/` for field and metadata validation
-- `core/output/` for formatting, naming, path resolution, and writing
+The design is deliberately modular: filtering, formatting, splitting, and writing are separate concerns. Most changes touch one small file, not a central controller.
-That separation is deliberate: most behavior changes can be made in one small subsystem instead of in one large control file.
+---
## Development
-Run the checks used during refactoring:
-
```bash
+# Tests
uv run pytest
uv run pytest --cov=chatgpt_export_tool --cov-report=term-missing
-uv run ruff check chatgpt_export_tool tests pyproject.toml
+
+# Lint & format
+uv run ruff check chatgpt_export_tool tests
uv run ruff format --check chatgpt_export_tool tests
```
-If you need to format files:
-
-```bash
-uv run ruff format chatgpt_export_tool tests
-```
+---
-## Notes
+## License
-- Input handling is streaming, so large exports do not need to be loaded into memory just to analyze or iterate conversations.
-- Single-file JSON export writes one valid JSON document.
-- Split exports write one conversation per output file.
-- Text export follows the active thread path using `current_node` and `parent` links.
-- The field-selection and metadata-selection surface is documented in [Fields.md](Fields.md).
+[MIT](LICENSE) — Void Freud ([@voidfreud](https://github.com/voidfreud))
diff --git a/chatgpt_export.toml.example b/chatgpt_export.toml.example
index 11987dd..287d023 100644
--- a/chatgpt_export.toml.example
+++ b/chatgpt_export.toml.example
@@ -1,5 +1,5 @@
[defaults]
-format = "txt"
+format = "md"
split = "single"
fields = "all"
output_dir = "output"
@@ -32,7 +32,7 @@ header_fields = ["title", "id", "create_time"]
conversation_time_format = "%H:%M %d-%m-%Y"
turn_time_format = "%H:%M %d-%m-%Y"
layout_mode = "reading"
-heading_style = "plain"
+heading_style = "markdown"
include_turn_count_in_header = true
include_turn_numbers = false
turn_separator = "---"
diff --git a/chatgpt_export_tool/commands/export.py b/chatgpt_export_tool/commands/export.py
index 8ed65f5..4fc6db3 100644
--- a/chatgpt_export_tool/commands/export.py
+++ b/chatgpt_export_tool/commands/export.py
@@ -153,7 +153,7 @@ def add_export_parser(
"export",
help="Extract conversations with field selection and metadata filtering",
description=(
- "Export ChatGPT conversations to txt or json format.\n\n"
+ "Export ChatGPT conversations to md, txt, or json format.\n\n"
"Use --fields to control which structural fields are retained,\n"
"and compose --include/--exclude to filter metadata fields.\n"
"Use --split to organize output into directories."
@@ -200,9 +200,9 @@ def add_export_parser(
export_parser.add_argument(
"--format",
"-F",
- choices=["txt", "json"],
+ choices=["md", "txt", "json"],
default=None,
- help="Output format: 'txt' or 'json' (default from config or built-in fallback)",
+ help="Output format: 'md', 'txt', or 'json' (default: from config, else 'md')",
)
export_parser.add_argument(
"--fields",
diff --git a/chatgpt_export_tool/core/config/models.py b/chatgpt_export_tool/core/config/models.py
index 64355f3..fd9db8a 100644
--- a/chatgpt_export_tool/core/config/models.py
+++ b/chatgpt_export_tool/core/config/models.py
@@ -13,7 +13,7 @@
class DefaultsConfig:
"""Default CLI/runtime values."""
- format_type: str = "txt"
+ format_type: str = "md"
split_mode: str = "single"
field_spec: str = "all"
output_dir: str = "output"
diff --git a/chatgpt_export_tool/core/config/validation.py b/chatgpt_export_tool/core/config/validation.py
index de06bce..bc885a5 100644
--- a/chatgpt_export_tool/core/config/validation.py
+++ b/chatgpt_export_tool/core/config/validation.py
@@ -9,7 +9,7 @@
def validate_defaults_config(defaults: DefaultsConfig) -> None:
"""Validate default config values that have constrained semantics."""
- valid_formats = {"txt", "json"}
+ valid_formats = {"md", "txt", "json"}
if defaults.format_type not in valid_formats:
raise ValueError(
"Config value 'format' must be one of: " + ", ".join(sorted(valid_formats))
diff --git a/chatgpt_export_tool/core/export_service.py b/chatgpt_export_tool/core/export_service.py
index 4a400da..1750838 100644
--- a/chatgpt_export_tool/core/export_service.py
+++ b/chatgpt_export_tool/core/export_service.py
@@ -40,7 +40,7 @@ class ExportConfig:
"""
filepath: str
- format_type: str = "txt"
+ format_type: str = "md"
output_file: Optional[str] = None
output_dir: str = "output"
split_mode: SplitMode = SplitMode.SINGLE
diff --git a/chatgpt_export_tool/core/output/formatters.py b/chatgpt_export_tool/core/output/formatters.py
index 2d56540..c1c935c 100644
--- a/chatgpt_export_tool/core/output/formatters.py
+++ b/chatgpt_export_tool/core/output/formatters.py
@@ -73,6 +73,10 @@ def __init__(
indent,
)
+ @property
+ def _is_markdown(self) -> bool:
+ return self.text_output_config.heading_style == "markdown"
+
def format_conversation(self, conv: dict[str, Any]) -> str:
"""Format a conversation as text.
@@ -82,7 +86,9 @@ def format_conversation(self, conv: dict[str, Any]) -> str:
Returns:
Formatted conversation text.
"""
- lines = ["-" * 40]
+ lines: list[str] = []
+ if not self._is_markdown:
+ lines.append("-" * 40)
if self.include_header:
lines.extend(self._render_header(conv))
@@ -104,12 +110,18 @@ def format_conversation(self, conv: dict[str, Any]) -> str:
and self.text_output_config.include_turn_count_in_header
):
header_index = self._find_header_insert_index(lines)
- lines.insert(header_index, f"Turns: {len(turn_blocks)}")
+ turn_label = (
+ f"**Turns:** {len(turn_blocks)}"
+ if self._is_markdown
+ else f"Turns: {len(turn_blocks)}"
+ )
+ lines.insert(header_index, turn_label)
lines.insert(header_index + 1, "")
lines.extend(self._render_chat_entries(turn_blocks))
- lines.append("-" * 40)
+ if not self._is_markdown:
+ lines.append("-" * 40)
return "\n".join(lines)
def _find_header_insert_index(self, lines: list[str]) -> int:
@@ -135,7 +147,10 @@ def _render_context_entries(self, entries: list[TranscriptEntry]) -> list[str]:
for entry in entries:
for line in self._prepare_text(entry.text).splitlines():
if line.strip():
- lines.append(f"{self.indent}{line}")
+ if self._is_markdown:
+ lines.append(f"> {line}")
+ else:
+ lines.append(f"{self.indent}{line}")
return lines
def _group_chat_entries(self, entries: list[TranscriptEntry]) -> list[TurnBlock]:
@@ -212,6 +227,11 @@ def _prepare_text(self, text: str) -> str:
if self.text_output_config.strip_chatgpt_artifacts:
prepared = CHATGPT_ARTIFACT_RE.sub("", prepared)
prepared = re.sub(r"\n{3,}", "\n\n", prepared)
+ if self._is_markdown:
+ prepared = re.sub(
+ r"^[ \t]*[-*_]{3,}[ \t]*$", "", prepared, flags=re.MULTILINE
+ )
+ prepared = re.sub(r"\n{3,}", "\n\n", prepared)
return prepared.strip()
def _render_section_heading(self, title: str) -> str:
@@ -271,7 +291,13 @@ def _render_header(self, conv: dict[str, Any]) -> list[str]:
value = self._get_header_value(conv, field_name)
if value is None:
continue
- lines.append(f"{self._get_header_label(field_name)}: {value}")
+ if self._is_markdown:
+ if field_name == "title":
+ lines.append(f"# {value}")
+ else:
+ lines.append(f"**{self._get_header_label(field_name)}:** {value}")
+ else:
+ lines.append(f"{self._get_header_label(field_name)}: {value}")
return lines
def _get_header_label(self, field_name: str) -> str:
@@ -335,6 +361,7 @@ def format_conversation(self, conv: dict[str, Any]) -> str:
FORMATTERS = {
+ "md": TextFormatter,
"txt": TextFormatter,
"json": JSONFormatter,
}
@@ -360,6 +387,18 @@ def get_formatter(format_type: str, **kwargs: Any) -> BaseFormatter:
formatter_kwargs = kwargs
if format_type == "json":
formatter_kwargs = {}
+ elif format_type == "md":
+ # Override text_output_config to force markdown heading style
+ text_output_config = formatter_kwargs.get("text_output_config")
+ if text_output_config is None:
+ from chatgpt_export_tool.core.config.runtime import TextOutputConfig
+
+ text_output_config = TextOutputConfig(heading_style="markdown")
+ elif text_output_config.heading_style != "markdown":
+ from dataclasses import replace
+
+ text_output_config = replace(text_output_config, heading_style="markdown")
+ formatter_kwargs["text_output_config"] = text_output_config
formatter = FORMATTERS[format_type](**formatter_kwargs)
logger.debug("Created formatter %s", type(formatter).__name__)
return formatter
diff --git a/chatgpt_export_tool/core/output/naming.py b/chatgpt_export_tool/core/output/naming.py
index b02e2bf..91921aa 100644
--- a/chatgpt_export_tool/core/output/naming.py
+++ b/chatgpt_export_tool/core/output/naming.py
@@ -42,7 +42,7 @@ def sanitize(self, title: Optional[str]) -> str:
return sanitized or "untitled"
- def get_filename(self, stem: Any, extension: str = "txt") -> str:
+ def get_filename(self, stem: Any, extension: str = "md") -> str:
"""Build a filename from a stem and extension.
Args:
diff --git a/chatgpt_export_tool/core/output/paths.py b/chatgpt_export_tool/core/output/paths.py
index 1fb5e21..7ad7cf4 100644
--- a/chatgpt_export_tool/core/output/paths.py
+++ b/chatgpt_export_tool/core/output/paths.py
@@ -17,7 +17,7 @@ class OutputPathResolver:
def __init__(
self,
output_dir: str | Path = "output",
- format_type: str = "txt",
+ format_type: str = "md",
split_mode: Optional[SplitMode] = None,
file_namer: Optional[FileNamer] = None,
) -> None:
diff --git a/chatgpt_export_tool/core/output/writer.py b/chatgpt_export_tool/core/output/writer.py
index 22e9832..a924d29 100644
--- a/chatgpt_export_tool/core/output/writer.py
+++ b/chatgpt_export_tool/core/output/writer.py
@@ -58,7 +58,7 @@ class OutputWriter:
def __init__(
self,
output_dir: str = "output",
- format_type: str = "txt",
+ format_type: str = "md",
split_mode: Optional[SplitMode] = None,
) -> None:
"""Initialize an output writer.
diff --git a/tests/test_output_writer.py b/tests/test_output_writer.py
index f296e9c..4498ef5 100644
--- a/tests/test_output_writer.py
+++ b/tests/test_output_writer.py
@@ -97,13 +97,13 @@ def test_get_filename(self):
assert filename == "My_Conversation.txt"
def test_get_filename_default_extension(self):
- """Test get_filename uses 'txt' as default extension."""
+ """Test get_filename uses 'md' as default extension."""
namer = FileNamer()
conv = {"title": "Test"}
filename = namer.get_filename(conv)
- assert filename == "Test.txt"
+ assert filename == "Test.md"
def test_get_filename_falls_back_to_untitled(self):
"""Test get_filename uses 'untitled' when no title."""
@@ -112,7 +112,7 @@ def test_get_filename_falls_back_to_untitled(self):
conv = {}
filename = namer.get_filename(conv)
- assert filename == "untitled.txt"
+ assert filename == "untitled.md"
class TestWriteResult:
@@ -170,7 +170,7 @@ def test_init_default_values(self):
writer = OutputWriter()
assert writer.output_dir == Path("output")
- assert writer.format_type == "txt"
+ assert writer.format_type == "md"
assert writer.split_mode is None
def test_init_with_values(self):
@@ -193,7 +193,7 @@ def test_path_resolution_subject_mode_uses_title_plus_id(self):
path = resolver.get_filepath({"title": "Test", "id": "123"}, "Test_123")
- assert path == Path("/output/Test_123.txt")
+ assert path == Path("/output/Test_123.md")
def test_write_conversations_single_group(self, tmp_path):
"""Test write_jobs with two conversations."""
diff --git a/tests/test_runtime_config.py b/tests/test_runtime_config.py
index 0f0d769..525d9f7 100644
--- a/tests/test_runtime_config.py
+++ b/tests/test_runtime_config.py
@@ -19,7 +19,7 @@ def test_load_runtime_config_defaults_when_no_file(
config = load_runtime_config()
- assert config.defaults.format_type == "txt"
+ assert config.defaults.format_type == "md"
assert config.transcript.show_assistant_thoughts is True
assert config.text_output.header_fields == ("title", "id", "create_time")
assert config.source_path is None
@@ -69,7 +69,7 @@ def test_load_runtime_config_does_not_implicitly_read_cwd_file(
config = load_runtime_config()
- assert config.defaults.format_type == "txt"
+ assert config.defaults.format_type == "md"
assert config.source_path is None
diff --git a/tests/test_runtime_contract.py b/tests/test_runtime_contract.py
index b27238a..8bec08c 100644
--- a/tests/test_runtime_contract.py
+++ b/tests/test_runtime_contract.py
@@ -352,8 +352,8 @@ def test_single_split_without_output_writes_to_stdout(
captured = capsys.readouterr()
assert exit_code == 0
- assert "Title: Alpha" in captured.out
- assert "Title: Beta" in captured.out
+ assert "# Alpha" in captured.out
+ assert "# Beta" in captured.out
assert "Exported 2 files" not in captured.out
assert not output_dir.exists()
@@ -373,7 +373,7 @@ def test_single_split_with_output_writes_one_file(self, tmp_path: Path) -> None:
assert exit_code == 0
assert output_file.exists()
content = output_file.read_text(encoding="utf-8")
- assert "Title: Alpha" in content
+ assert "# Alpha" in content
def test_single_json_output_is_valid_json(self, tmp_path: Path) -> None:
"""Single JSON export writes one valid JSON document."""
@@ -499,7 +499,7 @@ def test_split_subject_naming_uses_stable_source_fields(
assert exit_code == 0
names = sorted(path.name for path in output_dir.iterdir())
- assert names == ["Same_conv-1.txt", "Same_conv-2.txt"]
+ assert names == ["Same_conv-1.md", "Same_conv-2.md"]
def test_split_write_failures_return_non_zero_exit_code(
self, tmp_path: Path