Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions .github/workflows/render-video.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
name: Render Promo Video

# Render whenever video sources change, on PRs into main, or on demand.
on:
  push:
    branches: [main, feat/video, feat/image-handling]
    paths:
      - "video/**"
  pull_request:
    branches: [main]
    paths:
      - "video/**"
  workflow_dispatch:
    inputs:
      codec:
        description: "Output codec"
        required: false
        default: "h264"
        type: choice
        options:
          - h264
          - h265
          - vp8
          - vp9
      crf:
        description: "Constant Rate Factor (quality, lower = better)"
        required: false
        default: "18"

jobs:
  render:
    runs-on: ubuntu-latest
    # Rendering is bounded; kill runaway Chromium/ffmpeg after 20 minutes.
    timeout-minutes: 20

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: 20
          # Cache npm downloads keyed on the video package lockfile.
          cache: npm
          cache-dependency-path: video/package-lock.json

- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y \
chromium-browser \
libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 \
libxcomposite1 libxdamage1 libxrandr2 libgbm1 \
libpango-1.0-0 libcairo2 libxshmfence1 \
fonts-noto fonts-liberation ffmpeg
sudo apt-get install -y libasound2t64 || sudo apt-get install -y libasound2
echo "BROWSER_PATH=$(which chromium-browser || which chromium || which google-chrome-stable || which google-chrome)" >> $GITHUB_ENV
Copy link
Copy Markdown

@cubic-dev-ai cubic-dev-ai Bot Mar 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Validate BROWSER_PATH before exporting it; otherwise the render step can fail with an empty browser executable path.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At .github/workflows/render-video.yml, line 55:

<comment>Validate `BROWSER_PATH` before exporting it; otherwise the render step can fail with an empty browser executable path.</comment>

<file context>
@@ -0,0 +1,85 @@
+          # libasound2 renamed to libasound2t64 on Ubuntu 24.04+
+          sudo apt-get install -y libasound2t64 || sudo apt-get install -y libasound2
+          # Find Chrome/Chromium path for Remotion
+          echo "BROWSER_PATH=$(which chromium-browser || which chromium || which google-chrome-stable || which google-chrome)" >> $GITHUB_ENV
+
+      - name: Install dependencies
</file context>
Fix with Cubic


- name: Install dependencies
working-directory: video
run: npm ci

- name: Cache Remotion bundle
uses: actions/cache@v4
with:
path: video/node_modules/.cache/remotion
key: remotion-bundle-${{ hashFiles('video/src/**') }}
restore-keys: remotion-bundle-

- name: Render video
working-directory: video
run: |
npx remotion render src/index.ts TreeDexVideo out/treedex.mp4 \
--browser-executable="$BROWSER_PATH" \
--codec=${{ github.event.inputs.codec || 'h264' }} \
--crf=${{ github.event.inputs.crf || '18' }} \
--concurrency=4 \
--image-format=jpeg \
--quality=85 \
--bundle-cache=true \
--log=verbose

- name: Upload rendered video
uses: actions/upload-artifact@v4
with:
name: treedex-promo-video
path: video/out/treedex.mp4
retention-days: 30

- name: Print video info
working-directory: video
run: |
ls -lh out/treedex.mp4
ffprobe -v quiet -print_format json -show_format -show_streams out/treedex.mp4 | head -50
55 changes: 53 additions & 2 deletions src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,53 @@ import {
structureContinuePrompt,
retrievalPrompt,
answerPrompt,
imageDescriptionPrompt,
} from "./prompts.js";
import { countTokens } from "./pdf-parser.js";
import type { Page, TreeNode, IndexData, Stats } from "./types.js";
import type { BaseLLM } from "./llm-backends.js";

/**
 * Append image descriptions to page text, modifying pages in place.
 *
 * For each image on a page: prefer its alt text; otherwise, if the LLM
 * supports vision and raw data is present, ask the model for a description
 * (falling back to a generic marker on failure); otherwise emit the generic
 * marker. Pages gain the markers as appended lines and their token counts
 * are recomputed. Descriptions are requested sequentially, one image at a time.
 */
async function describeImages(
  pages: Page[],
  llm?: BaseLLM | null,
  verbose: boolean = false,
): Promise<void> {
  for (const page of pages) {
    const images = page.images ?? [];
    if (images.length === 0) continue;

    const markers: string[] = [];
    for (const img of images) {
      const alt = (img.alt_text ?? "").trim();
      if (alt) {
        markers.push(`[Image: ${alt}]`);
        continue;
      }
      if (llm?.supportsVision && img.data) {
        try {
          const described = await llm.generateWithImage(
            imageDescriptionPrompt(),
            img.data,
            img.mime_type,
          );
          markers.push(`[Image: ${described.trim()}]`);
          continue;
        } catch {
          // Vision call failed — fall through to the generic marker.
        }
      }
      markers.push("[Image present]");
    }

    if (markers.length > 0) {
      page.text = page.text + "\n" + markers.join("\n");
      // Keep token accounting in sync with the enlarged text.
      page.token_count = countTokens(page.text);
      if (verbose) {
        console.log(` Page ${page.page_num}: ${markers.length} image(s) described`);
      }
    }
  }
}

/** Result of a TreeDex query. */
export class QueryResult {
readonly context: string;
Expand Down Expand Up @@ -100,13 +143,15 @@ export class TreeDex {
maxTokens?: number;
overlap?: number;
verbose?: boolean;
extractImages?: boolean;
},
): Promise<TreeDex> {
const {
loader,
maxTokens = 20000,
overlap = 1,
verbose = true,
extractImages = false,
} = options ?? {};

if (verbose) {
Expand All @@ -118,7 +163,7 @@ export class TreeDex {
if (loader) {
pages = await loader.load(path);
} else {
pages = await autoLoader(path);
pages = await autoLoader(path, { extractImages });
}

if (verbose) {
Expand All @@ -141,6 +186,9 @@ export class TreeDex {
): Promise<TreeDex> {
const { maxTokens = 20000, overlap = 1, verbose = true } = options ?? {};

// Describe images before grouping — appends text markers to pages
await describeImages(pages, llm, verbose);

const groups = groupPages(pages, maxTokens, overlap);

if (verbose) {
Expand Down Expand Up @@ -288,11 +336,14 @@ export class TreeDex {
const fs = await import("node:fs/promises");
const stripped = stripTextFromTree(this.tree);

// Strip images from pages — descriptions are already in text
const cleanPages: Page[] = this.pages.map(({ images: _images, ...rest }) => rest);

const data: IndexData = {
version: "1.0",
framework: "TreeDex",
tree: stripped,
pages: this.pages,
pages: cleanPages,
};

await fs.writeFile(path, JSON.stringify(data, null, 2), "utf-8");
Expand Down
3 changes: 2 additions & 1 deletion src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,6 @@ export {
structureContinuePrompt,
retrievalPrompt,
answerPrompt,
imageDescriptionPrompt,
} from "./prompts.js";
export type { Page, TreeNode, IndexData, Stats } from "./types.js";
export type { Page, PageImage, TreeNode, IndexData, Stats } from "./types.js";
106 changes: 105 additions & 1 deletion src/llm-backends.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,22 @@
export abstract class BaseLLM {
abstract generate(prompt: string): Promise<string>;

  /**
   * Whether this backend supports image inputs.
   * Subclasses override this to opt into `generateWithImage()`.
   */
  get supportsVision(): boolean {
    return false;
  }

  /**
   * Send a prompt with an image and return the generated text.
   *
   * Default implementation for non-vision backends: always throws.
   * Callers should check `supportsVision` before invoking.
   *
   * @param _prompt - Instruction to accompany the image (unused here).
   * @param _imageBase64 - Base64-encoded image payload (unused here).
   * @param _mimeType - Image MIME type, e.g. "image/png" (unused here).
   * @throws Error always, naming the concrete backend class.
   */
  async generateWithImage(
    _prompt: string,
    _imageBase64: string,
    _mimeType: string,
  ): Promise<string> {
    throw new Error(
      `${this.constructor.name} does not support vision/image inputs.`,
    );
  }

  /** Human-readable backend descriptor (the concrete class name). */
  toString(): string {
    return `${this.constructor.name}()`;
  }
Expand Down Expand Up @@ -65,11 +81,28 @@ export class GeminiLLM extends BaseLLM {
}

async generate(prompt: string): Promise<string> {
const model = await this.getClient() as { generateContent(p: string): Promise<{ response: { text(): string } }> };
const model = await this.getClient() as { generateContent(p: unknown): Promise<{ response: { text(): string } }> };
const response = await model.generateContent(prompt);
return response.response.text();
}

  // NOTE(review): hard-coded to true regardless of the configured model;
  // a text-only `modelName` would still attempt (and fail) image calls.
  // Consider deriving this from the model name or a constructor option.
  get supportsVision(): boolean {
    return true;
  }
Comment on lines +89 to +91
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, let's examine the src/llm-backends.ts file to understand the structure and find the relevant lines
wc -l src/llm-backends.ts

Repository: mithun50/TreeDex

Length of output: 83


🏁 Script executed:

# Read the entire file to understand the implementation
cat -n src/llm-backends.ts

Repository: mithun50/TreeDex

Length of output: 25884


🏁 Script executed:

# Also check src/core.ts to understand how supportsVision is used
cat -n src/core.ts

Repository: mithun50/TreeDex

Length of output: 13819


supportsVision should be model-aware, not hard-coded by backend class.

These getters always return true, even when callers override modelName with a text-only model. While generateWithImage() errors are caught with graceful fallback in src/core.ts (line 55), this causes unnecessary API calls and errors for text-only model configurations. Please make this configurable per instance or derive it from the chosen model to avoid wasted requests.

Also applies to: 149-151, 225-227

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/llm-backends.ts` around lines 89 - 91, The supportsVision getter is
hard-coded to return true causing wasted image requests; change it to be
model-aware by making supportsVision read from instance state (e.g.,
this.modelName or this.opts.supportsVision) or derive it from a helper
isVisionModel(modelName). Update the getters at the three locations (the get
supportsVision() implementations around lines 89-91, 149-151, 225-227) to return
a boolean based on either an explicit constructor option (supportsVision) or on
a small isVisionModel(this.modelName) lookup, and add that option/lookup where
the backend classes are constructed so callers can override or the backend can
compute capability from the model name.


async generateWithImage(
prompt: string,
imageBase64: string,
mimeType: string,
): Promise<string> {
const model = await this.getClient() as { generateContent(p: unknown): Promise<{ response: { text(): string } }> };
const imagePart = {
inlineData: { mimeType, data: imageBase64 },
};
const response = await model.generateContent([prompt, imagePart]);
return response.response.text();
}

  /** Descriptor including the configured model name. */
  toString(): string {
    return `GeminiLLM(model=${JSON.stringify(this.modelName)})`;
  }
Expand Down Expand Up @@ -113,6 +146,40 @@ export class OpenAILLM extends BaseLLM {
return response.choices[0].message.content;
}

  // NOTE(review): hard-coded to true regardless of the configured model;
  // a text-only `modelName` would still attempt (and fail) image calls.
  // Consider deriving this from the model name or a constructor option.
  get supportsVision(): boolean {
    return true;
  }

async generateWithImage(
prompt: string,
imageBase64: string,
mimeType: string,
): Promise<string> {
const client = await this.getClient() as {
chat: {
completions: {
create(opts: unknown): Promise<{
choices: Array<{ message: { content: string } }>;
}>;
};
};
};
const response = await client.chat.completions.create({
model: this.modelName,
messages: [{
role: "user",
content: [
{ type: "text", text: prompt },
{
type: "image_url",
image_url: { url: `data:${mimeType};base64,${imageBase64}` },
},
],
}],
});
return response.choices[0].message.content;
}

  /** Descriptor including the configured model name. */
  toString(): string {
    return `OpenAILLM(model=${JSON.stringify(this.modelName)})`;
  }
Expand Down Expand Up @@ -155,6 +222,43 @@ export class ClaudeLLM extends BaseLLM {
return response.content[0].text;
}

  // NOTE(review): hard-coded to true regardless of the configured model;
  // a text-only `modelName` would still attempt (and fail) image calls.
  // Consider deriving this from the model name or a constructor option.
  get supportsVision(): boolean {
    return true;
  }

async generateWithImage(
prompt: string,
imageBase64: string,
mimeType: string,
): Promise<string> {
const client = await this.getClient() as {
messages: {
create(opts: unknown): Promise<{
content: Array<{ text: string }>;
}>;
};
};
const response = await client.messages.create({
model: this.modelName,
max_tokens: 4096,
messages: [{
role: "user",
content: [
{
type: "image",
source: {
type: "base64",
media_type: mimeType,
data: imageBase64,
},
},
{ type: "text", text: prompt },
],
}],
});
return response.content[0].text;
}

  /** Descriptor including the configured model name. */
  toString(): string {
    return `ClaudeLLM(model=${JSON.stringify(this.modelName)})`;
  }
Expand Down
Loading
Loading