diff --git a/CHROME_WEB_STORE.md b/CHROME_WEB_STORE.md new file mode 100644 index 0000000..c8e217d --- /dev/null +++ b/CHROME_WEB_STORE.md @@ -0,0 +1,113 @@ +# Chrome Web Store Listing + +## Extension Name + +Scrape GitHub by Olostep + +## Short Description + +Extract GitHub data as JSON or CSV, then discover Olostep's Web Data API for scalable scraping and AI workflows. + +## Full Description + +Scrape GitHub by Olostep is a lightweight Chrome extension that turns GitHub repository pages and personal user profiles into structured JSON or CSV directly from your browser. + +Use it to quickly extract GitHub data such as repository names, descriptions, stars, forks, watchers, topics, programming languages, licenses, README summaries, profile bios, follower counts, social links, and pinned repositories. + +This extension is designed as a simple example of structured web data extraction. For developers and teams that need to scrape websites at scale, crawl pages, extract clean Markdown, HTML, PDFs, or structured JSON, and power AI agents with reliable web data, Olostep provides a full Web Data API. + +Olostep helps you search, extract, and structure web data from any website through developer-friendly APIs for scrapes, crawls, answers, parsers, batches, and browser automations. + +Key features: + +- Scrape GitHub repository pages into structured JSON +- Export GitHub data as CSV +- Parse GitHub user profiles and pinned repositories +- Copy or download extracted data from the browser +- Run local parsing from the active GitHub tab +- Discover Olostep for scalable web scraping, crawling, and AI data workflows + +Scrape GitHub is useful for open-source research, developer sourcing, repository analysis, technical research, competitive intelligence, and structured GitHub data collection. + +Supported pages currently include GitHub repository root pages and personal user profile root pages. 
+ +When you are ready to move beyond a browser extension, visit https://olostep.com to build scalable web data pipelines with Olostep's Web Data API for AI. + +## Single Purpose Statement + +Scrape GitHub by Olostep extracts structured JSON or CSV from the active supported GitHub repository or personal profile page when the user clicks the extension popup. + +## Category + +Developer Tools + +## Language + +English + +## Website URL + +https://olostep.com + +## Support URL + +https://olostep.com + +## Privacy Policy URL + +https://www.olostep.com/privacy-policy + +## Permission Justifications + +### activeTab + +Required so the extension can access the currently active tab only after the user opens the extension and requests parsing. The extension uses this access to read the supported GitHub page the user wants to parse. + +### scripting + +Required to run a small script in the active tab that reads the page URL and HTML. This lets the extension parse the current GitHub repository or profile page locally in the popup. + +### Host permission: https://github.com/* + +Required so the extension can read the GitHub page the user asks it to parse. Host access is scoped to github.com only. The extension supports GitHub repository root pages and personal user profile root pages. + +## Privacy Practices Answers + +### Does the extension collect user data? + +No. The extension reads the active GitHub page HTML and URL locally only when the user clicks "Parse current page". It does not store, transmit, sell, or share parsed data. + +### Does the extension use remote code? + +No. All extension code is included in the extension package. + +### Does the extension use analytics or tracking? + +No. The extension does not include analytics, advertising SDKs, telemetry, tracking pixels, or behavioral tracking. + +### Does the extension transfer data to Olostep or third parties? + +No. Parsed GitHub page content, JSON, CSV, and URLs are not sent to Olostep or third parties. 
The popup includes a link to Olostep's website, and clicking it navigates the browser to https://olostep.com. + +## Store Assets + +Prepared assets: + +- `store-assets/screenshot-1280x800.png` +- `store-assets/small-promo-440x280.png` +- `extension/icons/icon-128.png` + +## Manual QA Checklist + +- Load the `extension/` folder as an unpacked extension in Chrome. +- Open a GitHub repository root page and confirm parsing succeeds. +- Open a GitHub personal profile root page and confirm parsing succeeds. +- Open an unsupported GitHub subpage and confirm parsing is disabled. +- Open a non-GitHub page and confirm parsing is disabled. +- Copy JSON and CSV results. +- Download JSON and CSV results. +- Click the Olostep links and confirm they open https://olostep.com. + +## Upload Notes + +Upload `dist/scrape-github-by-olostep.zip` to the Chrome Web Store dashboard. diff --git a/docs/assets/chrome-extension-parse.gif b/docs/assets/chrome-extension-parse.gif index a2ef702..94f0af5 100644 Binary files a/docs/assets/chrome-extension-parse.gif and b/docs/assets/chrome-extension-parse.gif differ diff --git a/extension/github-parsers.js b/extension/github-parsers.js index 2bf2176..d168dbd 100644 --- a/extension/github-parsers.js +++ b/extension/github-parsers.js @@ -15,7 +15,9 @@ function getMeta(doc, selector) { function parseCount(value) { if (!value) return null; const normalized = value.replace(/,/g, "").trim().toLowerCase(); - const match = normalized.match(/^([\d.]+)([km])?$/); + const match = + normalized.match(/^([\d.]+)\s*([km])?$/) || + normalized.match(/([\d.]+)\s*([km])?(?=\s*(stars?|forks?|watch(?:ing|ers)?|followers?|following|$))/); if (!match) return null; const number = Number(match[1]); if (Number.isNaN(number)) return null; @@ -83,8 +85,8 @@ function parseRepository(htmlString, pageUrl) { return href.includes("/LICENSE") || /license/i.test(getText(node) || ""); }); const readmeSummary = getText( - doc.querySelector("article.markdown-body p, 
article.markdown-body h1, article.markdown-body h2") - ); + doc.querySelector("#readme article.markdown-body p, #readme article.markdown-body h1, #readme article.markdown-body h2, article.markdown-body p, article.markdown-body h1, article.markdown-body h2, .js-snippet-clipboard-copy-unpositioned pre") + ) || getText(doc.querySelector("#readme .markdown-body, #readme .Box-body, #readme, article.markdown-body, .markdown-body, .js-snippet-clipboard-copy-unpositioned")); return { success: true, diff --git a/extension/icons/icon-128.png b/extension/icons/icon-128.png new file mode 100644 index 0000000..e03aa5f Binary files /dev/null and b/extension/icons/icon-128.png differ diff --git a/extension/icons/icon-16.png b/extension/icons/icon-16.png new file mode 100644 index 0000000..765e684 Binary files /dev/null and b/extension/icons/icon-16.png differ diff --git a/extension/icons/icon-32.png b/extension/icons/icon-32.png new file mode 100644 index 0000000..7b969d2 Binary files /dev/null and b/extension/icons/icon-32.png differ diff --git a/extension/icons/icon-48.png b/extension/icons/icon-48.png new file mode 100644 index 0000000..f99f2ab Binary files /dev/null and b/extension/icons/icon-48.png differ diff --git a/extension/manifest.json b/extension/manifest.json index b814e3e..3aeec0b 100644 --- a/extension/manifest.json +++ b/extension/manifest.json @@ -1,8 +1,14 @@ { "manifest_version": 3, - "name": "Scrape GitHub", + "name": "Scrape GitHub by Olostep", "version": "0.1.0", - "description": "Parse GitHub repository and user profile pages into structured JSON.", + "description": "Extract GitHub repo and profile data as JSON/CSV, then discover Olostep's Web Data API for AI.", + "icons": { + "16": "icons/icon-16.png", + "32": "icons/icon-32.png", + "48": "icons/icon-48.png", + "128": "icons/icon-128.png" + }, "permissions": [ "activeTab", "scripting" @@ -11,8 +17,13 @@ "https://github.com/*" ], "action": { - "default_title": "Scrape GitHub", + "default_title": "Scrape 
GitHub by Olostep", "default_popup": "popup.html", - "default_icon": "olostep-icon.png" + "default_icon": { + "16": "icons/icon-16.png", + "32": "icons/icon-32.png", + "48": "icons/icon-48.png", + "128": "icons/icon-128.png" + } } } diff --git a/extension/popup.css b/extension/popup.css index b7207cb..af00b35 100644 --- a/extension/popup.css +++ b/extension/popup.css @@ -66,6 +66,7 @@ body { min-height: 36px; padding: 0 12px; text-decoration: none; + white-space: nowrap; } .panel { @@ -121,8 +122,63 @@ body { color: var(--muted); } -.actions { +.toolbar { display: flex; + justify-content: flex-end; + margin-bottom: 12px; +} + +.segmented { + display: inline-flex; + align-items: center; + gap: 6px; + background: var(--panel); + border: 1px solid var(--line); + border-radius: 999px; + padding: 6px; +} + +.segmented-label { + color: var(--muted); + font-size: 12px; + font-weight: 600; + padding: 0 8px 0 10px; +} + +.segmented-option { + border: 1px solid transparent; + background: transparent; + color: var(--ink); + border-radius: 999px; + cursor: pointer; + font: inherit; + font-size: 12px; + letter-spacing: 0.01em; + min-height: 28px; + padding: 0 10px; + transition: background-color 140ms ease, opacity 140ms ease; +} + +.segmented-option:hover:enabled { + background: rgba(108, 99, 255, 0.08); + transform: none; +} + +.segmented-option:disabled { + cursor: not-allowed; + opacity: 0.55; +} + +.segmented-option.is-active { + background: var(--accent-soft); + color: var(--accent-strong); + border-color: rgba(108, 99, 255, 0.18); + font-weight: 700; +} + +.actions { + display: grid; + grid-template-columns: 1fr 1fr; gap: 10px; } @@ -170,3 +226,15 @@ button:disabled { white-space: pre-wrap; word-break: break-word; } + +.footer-link { + margin: 12px 0 0; + text-align: center; +} + +.footer-link a { + color: var(--accent-strong); + font-size: 12px; + font-weight: 600; + text-decoration: none; +} diff --git a/extension/popup.html b/extension/popup.html index 
518f608..f5541de 100644 --- a/extension/popup.html +++ b/extension/popup.html @@ -3,22 +3,22 @@ - Scrape GitHub + Scrape GitHub by Olostep
-

olostep

+

by Olostep

Scrape GitHub

- API + Web Data API

GitHub parser

-

Parse repository roots and personal profile roots into structured JSON with a local extension UI that mirrors the Olostep workflow.

+

Extract GitHub repository and profile data locally. Need reliable web data at scale? Use Olostep's Web Data API for scrapes, crawls, parsers, and AI workflows.

@@ -29,13 +29,26 @@

Scrape GitHub

Open a GitHub repository root or user profile root.

+
+
+ + +
+
+
+ +
No parsed result yet.
+ +
diff --git a/extension/popup.js b/extension/popup.js index 4641d94..7f4331b 100644 --- a/extension/popup.js +++ b/extension/popup.js @@ -2,6 +2,10 @@ import { detectGitHubPageType, parseGitHubPage } from "./github-parsers.js"; const parseButton = document.getElementById("parse-button"); const copyButton = document.getElementById("copy-button"); +const downloadJsonButton = document.getElementById("download-json-button"); +const downloadCsvButton = document.getElementById("download-csv-button"); +const formatJsonButton = document.getElementById("format-json"); +const formatCsvButton = document.getElementById("format-csv"); const output = document.getElementById("output"); const statusBadge = document.getElementById("status-badge"); const statusText = document.getElementById("status-text"); @@ -9,6 +13,11 @@ const statusText = document.getElementById("status-text"); let lastResult = null; let currentTab = null; let currentPageType = null; +let outputFormat = "json"; + +function getOutputFormat() { + return outputFormat === "csv" ? "csv" : "json"; +} function setStatus(label, message) { statusBadge.textContent = label; @@ -19,6 +28,97 @@ function setOutput(value) { output.textContent = value; } +function escapeCsvCell(value) { + const text = String(value ?? 
""); + if (/[",\n\r]/.test(text)) { + return `"${text.replace(/"/g, '""')}"`; + } + return text; +} + +function toCsvRow(values) { + return values.map(escapeCsvCell).join(","); +} + +function toCsv(result) { + if (!result || typeof result !== "object") { + return ""; + } + + const keys = Object.keys(result); + const header = toCsvRow(keys); + const row = toCsvRow( + keys.map((key) => { + const value = result[key]; + if (value === null || value === undefined) return ""; + if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") { + return value; + } + return JSON.stringify(value); + }) + ); + + return `${header}\n${row}\n`; +} + +function getOutputText(format) { + if (!lastResult) { + return "No parsed result yet."; + } + if (format === "csv") { + return toCsv(lastResult); + } + return JSON.stringify(lastResult, null, 2); +} + +function getBaseFilename(result) { + if (!result || !result.success) { + return "github-parse"; + } + if (result.type === "repository" && result.fullName) { + return `github-repo-${result.fullName.replace("/", "__")}`; + } + if (result.type === "user_profile" && result.username) { + return `github-user-${result.username}`; + } + return `github-${result.type || "parse"}`; +} + +function downloadTextFile({ text, filename, mimeType }) { + const blob = new Blob([text], { type: mimeType }); + const url = URL.createObjectURL(blob); + const link = document.createElement("a"); + link.href = url; + link.download = filename; + document.body.appendChild(link); + link.click(); + link.remove(); + URL.revokeObjectURL(url); +} + +function setOutputFormat(nextFormat) { + outputFormat = nextFormat === "csv" ? 
"csv" : "json"; + formatJsonButton?.classList.toggle("is-active", outputFormat === "json"); + formatCsvButton?.classList.toggle("is-active", outputFormat === "csv"); + updateUi(); +} + +function updateUi() { + const isSupported = Boolean(currentPageType); + const hasResult = Boolean(lastResult); + const format = getOutputFormat(); + + parseButton.disabled = !isSupported; + formatJsonButton.disabled = !isSupported; + formatCsvButton.disabled = !isSupported; + copyButton.disabled = !isSupported || !hasResult; + downloadJsonButton.disabled = !isSupported || !hasResult; + downloadCsvButton.disabled = !isSupported || !hasResult; + + copyButton.textContent = format === "csv" ? "Copy CSV" : "Copy JSON"; + setOutput(getOutputText(format)); +} + async function getActiveTab() { const [tab] = await chrome.tabs.query({ active: true, currentWindow: true }); return tab || null; @@ -42,19 +142,17 @@ async function refreshPageSupport() { currentPageType = detectGitHubPageType(url); if (!currentPageType) { - parseButton.disabled = true; - copyButton.disabled = true; setStatus("Unsupported", "Supported pages are GitHub repository roots and personal profile roots only."); - setOutput("No parsed result yet."); + lastResult = null; + updateUi(); return; } - parseButton.disabled = false; - copyButton.disabled = !lastResult; setStatus( currentPageType === "repository" ? "Repository" : "User profile", "This page is supported. Run the local parser to generate structured JSON." 
); + updateUi(); } parseButton.addEventListener("click", async () => { @@ -64,6 +162,8 @@ parseButton.addEventListener("click", async () => { parseButton.disabled = true; copyButton.disabled = true; + downloadJsonButton.disabled = true; + downloadCsvButton.disabled = true; setStatus("Parsing", "Reading the current page and running the parser locally."); setOutput("Parsing..."); @@ -71,11 +171,10 @@ parseButton.addEventListener("click", async () => { const { html, url } = await readActiveTabHtml(currentTab.id); const result = parseGitHubPage(currentPageType, html, url); lastResult = result; - setOutput(JSON.stringify(result, null, 2)); + updateUi(); if (result.success) { setStatus("Success", "Structured JSON generated for the current page."); - copyButton.disabled = false; } else { setStatus("Parser error", result.error || "Unable to parse this page."); } @@ -84,7 +183,7 @@ parseButton.addEventListener("click", async () => { setStatus("Runtime error", error.message); setOutput(error.stack || error.message); } finally { - parseButton.disabled = false; + updateUi(); } }); @@ -93,8 +192,39 @@ copyButton.addEventListener("click", async () => { return; } - await navigator.clipboard.writeText(JSON.stringify(lastResult, null, 2)); - setStatus("Copied", "JSON copied to the clipboard."); + const format = getOutputFormat(); + await navigator.clipboard.writeText(getOutputText(format)); + setStatus("Copied", format === "csv" ? "CSV copied to the clipboard." 
: "JSON copied to the clipboard."); +}); + +downloadJsonButton.addEventListener("click", () => { + if (!lastResult) return; + const base = getBaseFilename(lastResult); + downloadTextFile({ + text: JSON.stringify(lastResult, null, 2), + filename: `${base}.json`, + mimeType: "application/json" + }); + setStatus("Downloaded", "JSON saved to your downloads."); +}); + +downloadCsvButton.addEventListener("click", () => { + if (!lastResult) return; + const base = getBaseFilename(lastResult); + downloadTextFile({ + text: toCsv(lastResult), + filename: `${base}.csv`, + mimeType: "text/csv" + }); + setStatus("Downloaded", "CSV saved to your downloads."); +}); + +formatJsonButton.addEventListener("click", () => { + setOutputFormat("json"); +}); + +formatCsvButton.addEventListener("click", () => { + setOutputFormat("csv"); }); refreshPageSupport(); diff --git a/parsers/github-repository.parser.js b/parsers/github-repository.parser.js index f2b4a46..e82a6b6 100644 --- a/parsers/github-repository.parser.js +++ b/parsers/github-repository.parser.js @@ -11,7 +11,9 @@ function getAttr(node, attr) { function parseCount(value) { if (!value) return null; const normalized = value.replace(/,/g, "").trim().toLowerCase(); - const match = normalized.match(/^([\d.]+)([km])?$/); + const match = + normalized.match(/^([\d.]+)\s*([km])?$/) || + normalized.match(/([\d.]+)\s*([km])?(?=\s*(stars?|forks?|watch(?:ing|ers)?|followers?|following|$))/); if (!match) return null; const number = Number(match[1]); if (Number.isNaN(number)) return null; @@ -82,8 +84,8 @@ function parseRepository(htmlString, pageUrl) { .map((node) => getText(node)) ); const readmeSummary = getText( - doc.querySelector("article.markdown-body p, article.markdown-body h1, article.markdown-body h2") - ); + doc.querySelector("#readme article.markdown-body p, #readme article.markdown-body h1, #readme article.markdown-body h2, article.markdown-body p, article.markdown-body h1, article.markdown-body h2, 
.js-snippet-clipboard-copy-unpositioned pre") + ) || getText(doc.querySelector("#readme .markdown-body, #readme .Box-body, #readme, article.markdown-body, .markdown-body, .js-snippet-clipboard-copy-unpositioned")); const socialCounts = Array.from( doc.querySelectorAll('a[href$="/stargazers"], a[href$="/forks"], a[href$="/watchers"]') diff --git a/parsers/github-user-profile.parser.js b/parsers/github-user-profile.parser.js index d3b2307..caf1053 100644 --- a/parsers/github-user-profile.parser.js +++ b/parsers/github-user-profile.parser.js @@ -11,7 +11,9 @@ function getAttr(node, attr) { function parseCount(value) { if (!value) return null; const normalized = value.replace(/,/g, "").trim().toLowerCase(); - const match = normalized.match(/^([\d.]+)([km])?$/); + const match = + normalized.match(/^([\d.]+)\s*([km])?$/) || + normalized.match(/([\d.]+)\s*([km])?(?=\s*(stars?|forks?|watch(?:ing|ers)?|followers?|following|$))/); if (!match) return null; const number = Number(match[1]); if (Number.isNaN(number)) return null; diff --git a/store-assets/screenshot-1280x800.png b/store-assets/screenshot-1280x800.png new file mode 100644 index 0000000..a259217 Binary files /dev/null and b/store-assets/screenshot-1280x800.png differ diff --git a/store-assets/small-promo-440x280.png b/store-assets/small-promo-440x280.png new file mode 100644 index 0000000..8d54caa Binary files /dev/null and b/store-assets/small-promo-440x280.png differ diff --git a/store-assets/source-screenshot.png b/store-assets/source-screenshot.png new file mode 100644 index 0000000..8c6af4e Binary files /dev/null and b/store-assets/source-screenshot.png differ