-
Notifications
You must be signed in to change notification settings - Fork 3
Add image handling to document indexing pipeline #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ac61d6f
7aa299f
9c3a774
40ff34d
17d6bf7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| name: Render Promo Video | ||
|
|
||
| on: | ||
| push: | ||
| branches: [main, feat/video, feat/image-handling] | ||
| paths: | ||
| - "video/**" | ||
| pull_request: | ||
| branches: [main] | ||
| paths: | ||
| - "video/**" | ||
| workflow_dispatch: | ||
| inputs: | ||
| codec: | ||
| description: "Output codec" | ||
| required: false | ||
| default: "h264" | ||
| type: choice | ||
| options: | ||
| - h264 | ||
| - h265 | ||
| - vp8 | ||
| - vp9 | ||
| crf: | ||
| description: "Constant Rate Factor (quality, lower = better)" | ||
| required: false | ||
| default: "18" | ||
|
|
||
| jobs: | ||
| render: | ||
| runs-on: ubuntu-latest | ||
| timeout-minutes: 20 | ||
|
|
||
| steps: | ||
| - uses: actions/checkout@v4 | ||
|
|
||
| - uses: actions/setup-node@v4 | ||
| with: | ||
| node-version: 20 | ||
| cache: npm | ||
| cache-dependency-path: video/package-lock.json | ||
|
|
||
| - name: Install system dependencies | ||
| run: | | ||
| sudo apt-get update | ||
| sudo apt-get install -y \ | ||
| chromium-browser \ | ||
| libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 \ | ||
| libxcomposite1 libxdamage1 libxrandr2 libgbm1 \ | ||
| libpango-1.0-0 libcairo2 libxshmfence1 \ | ||
| fonts-noto fonts-liberation ffmpeg | ||
| sudo apt-get install -y libasound2t64 || sudo apt-get install -y libasound2 | ||
| echo "BROWSER_PATH=$(which chromium-browser || which chromium || which google-chrome-stable || which google-chrome)" >> $GITHUB_ENV | ||
|
|
||
| - name: Install dependencies | ||
| working-directory: video | ||
| run: npm ci | ||
|
|
||
| - name: Cache Remotion bundle | ||
| uses: actions/cache@v4 | ||
| with: | ||
| path: video/node_modules/.cache/remotion | ||
| key: remotion-bundle-${{ hashFiles('video/src/**') }} | ||
| restore-keys: remotion-bundle- | ||
|
|
||
| - name: Render video | ||
| working-directory: video | ||
| run: | | ||
| npx remotion render src/index.ts TreeDexVideo out/treedex.mp4 \ | ||
| --browser-executable="$BROWSER_PATH" \ | ||
| --codec=${{ github.event.inputs.codec || 'h264' }} \ | ||
| --crf=${{ github.event.inputs.crf || '18' }} \ | ||
| --concurrency=4 \ | ||
| --image-format=jpeg \ | ||
| --quality=85 \ | ||
| --bundle-cache=true \ | ||
| --log=verbose | ||
|
|
||
| - name: Upload rendered video | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: treedex-promo-video | ||
| path: video/out/treedex.mp4 | ||
| retention-days: 30 | ||
|
|
||
| - name: Print video info | ||
| working-directory: video | ||
| run: | | ||
| ls -lh out/treedex.mp4 | ||
| ffprobe -v quiet -print_format json -show_format -show_streams out/treedex.mp4 | head -50 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,22 @@ | |
| export abstract class BaseLLM { | ||
| abstract generate(prompt: string): Promise<string>; | ||
|
|
||
| /** Whether this backend supports image inputs. */ | ||
| get supportsVision(): boolean { | ||
| return false; | ||
| } | ||
|
|
||
| /** Send a prompt with an image and return the generated text. */ | ||
| async generateWithImage( | ||
| _prompt: string, | ||
| _imageBase64: string, | ||
| _mimeType: string, | ||
| ): Promise<string> { | ||
| throw new Error( | ||
| `${this.constructor.name} does not support vision/image inputs.`, | ||
| ); | ||
| } | ||
|
|
||
| toString(): string { | ||
| return `${this.constructor.name}()`; | ||
| } | ||
|
|
@@ -65,11 +81,28 @@ export class GeminiLLM extends BaseLLM { | |
| } | ||
|
|
||
| async generate(prompt: string): Promise<string> { | ||
| const model = await this.getClient() as { generateContent(p: string): Promise<{ response: { text(): string } }> }; | ||
| const model = await this.getClient() as { generateContent(p: unknown): Promise<{ response: { text(): string } }> }; | ||
| const response = await model.generateContent(prompt); | ||
| return response.response.text(); | ||
| } | ||
|
|
||
| get supportsVision(): boolean { | ||
| return true; | ||
| } | ||
|
Comment on lines
+89
to
+91
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain 🏁 Script executed: # First, let's examine the src/llm-backends.ts file to understand the structure and find the relevant lines
wc -l src/llm-backends.ts — Repository: mithun50/TreeDex — Length of output: 83 🏁 Script executed: # Read the entire file to understand the implementation
cat -n src/llm-backends.ts — Repository: mithun50/TreeDex — Length of output: 25884 🏁 Script executed: # Also check src/core.ts to understand how supportsVision is used
cat -n src/core.ts — Repository: mithun50/TreeDex — Length of output: 13819
These getters always return `true`. Also applies to: 149-151, 225-227. 🤖 Prompt for AI Agents |
||
|
|
||
| async generateWithImage( | ||
| prompt: string, | ||
| imageBase64: string, | ||
| mimeType: string, | ||
| ): Promise<string> { | ||
| const model = await this.getClient() as { generateContent(p: unknown): Promise<{ response: { text(): string } }> }; | ||
| const imagePart = { | ||
| inlineData: { mimeType, data: imageBase64 }, | ||
| }; | ||
| const response = await model.generateContent([prompt, imagePart]); | ||
| return response.response.text(); | ||
| } | ||
|
|
||
| toString(): string { | ||
| return `GeminiLLM(model=${JSON.stringify(this.modelName)})`; | ||
| } | ||
|
|
@@ -113,6 +146,40 @@ export class OpenAILLM extends BaseLLM { | |
| return response.choices[0].message.content; | ||
| } | ||
|
|
||
| get supportsVision(): boolean { | ||
| return true; | ||
| } | ||
|
|
||
| async generateWithImage( | ||
| prompt: string, | ||
| imageBase64: string, | ||
| mimeType: string, | ||
| ): Promise<string> { | ||
| const client = await this.getClient() as { | ||
| chat: { | ||
| completions: { | ||
| create(opts: unknown): Promise<{ | ||
| choices: Array<{ message: { content: string } }>; | ||
| }>; | ||
| }; | ||
| }; | ||
| }; | ||
| const response = await client.chat.completions.create({ | ||
| model: this.modelName, | ||
| messages: [{ | ||
| role: "user", | ||
| content: [ | ||
| { type: "text", text: prompt }, | ||
| { | ||
| type: "image_url", | ||
| image_url: { url: `data:${mimeType};base64,${imageBase64}` }, | ||
| }, | ||
| ], | ||
| }], | ||
| }); | ||
| return response.choices[0].message.content; | ||
| } | ||
|
|
||
| toString(): string { | ||
| return `OpenAILLM(model=${JSON.stringify(this.modelName)})`; | ||
| } | ||
|
|
@@ -155,6 +222,43 @@ export class ClaudeLLM extends BaseLLM { | |
| return response.content[0].text; | ||
| } | ||
|
|
||
| get supportsVision(): boolean { | ||
| return true; | ||
| } | ||
|
|
||
| async generateWithImage( | ||
| prompt: string, | ||
| imageBase64: string, | ||
| mimeType: string, | ||
| ): Promise<string> { | ||
| const client = await this.getClient() as { | ||
| messages: { | ||
| create(opts: unknown): Promise<{ | ||
| content: Array<{ text: string }>; | ||
| }>; | ||
| }; | ||
| }; | ||
| const response = await client.messages.create({ | ||
| model: this.modelName, | ||
| max_tokens: 4096, | ||
| messages: [{ | ||
| role: "user", | ||
| content: [ | ||
| { | ||
| type: "image", | ||
| source: { | ||
| type: "base64", | ||
| media_type: mimeType, | ||
| data: imageBase64, | ||
| }, | ||
| }, | ||
| { type: "text", text: prompt }, | ||
| ], | ||
| }], | ||
| }); | ||
| return response.content[0].text; | ||
| } | ||
|
|
||
| toString(): string { | ||
| return `ClaudeLLM(model=${JSON.stringify(this.modelName)})`; | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
P2: Validate
`BROWSER_PATH` before exporting it; otherwise the render step can fail with an empty browser executable path.
Prompt for AI agents