diff --git a/config/options.go b/config/options.go
index 959877bf..9c9ca3d3 100644
--- a/config/options.go
+++ b/config/options.go
@@ -112,6 +112,10 @@ const (
 	defDatabaseMinConns           = 1
 	defDatabaseConnectionLifetime = 5
 
+	defLLMProvider = ""
+	defLLMApiKey   = ""
+	defLLMModel    = ""
+
 	maxAttachSizeTelegram = 50000000   // 50MB
 	maxAttachSizeDiscord  = 8000000    // 8MB
 	maxAttachSizeSlack    = 5000000000 // 5GB
@@ -145,6 +149,7 @@ type Options struct {
 	notion     *notion
 	matrix     *matrix
 	slack      *slack
+	llm        *llm
 	services   sync.Map
 	privacyURL string
 	storageDir string
@@ -269,6 +274,12 @@ type meili struct {
 	apikey string
 }
 
+type llm struct {
+	provider string
+	apikey   string
+	model    string
+}
+
 type omnivore struct {
 	apikey string
 }
@@ -386,6 +397,11 @@ func NewOptions() *Options {
 			indexing: defMeiliIndexing,
 			apikey:   defMeiliApikey,
 		},
+		llm: &llm{
+			provider: defLLMProvider,
+			apikey:   defLLMApiKey,
+			model:    defLLMModel,
+		},
 		omnivore: &omnivore{
 			apikey: defOmnivoreApikey,
 		},
@@ -951,6 +967,21 @@ func (o *Options) MaxMediaSize() uint64 {
 	return size
 }
 
+// LLMProvider returns the configured LLM provider name; it is empty by default.
+func (o *Options) LLMProvider() string {
+	return o.llm.provider
+}
+
+// LLMApiKey returns the API key for the configured LLM provider.
+func (o *Options) LLMApiKey() string {
+	return o.llm.apikey
+}
+
+// LLMModel returns the configured LLM model; it is empty by default.
+func (o *Options) LLMModel() string {
+	return o.llm.model
+}
+
 // MaxAttachSize returns max attach size limits for several services.
// scope: telegram func (o *Options) MaxAttachSize(scope string) int64 { diff --git a/config/parser.go b/config/parser.go index 76d233a0..0d3df878 100644 --- a/config/parser.go +++ b/config/parser.go @@ -237,6 +237,12 @@ func (p *Parser) parseLines(lines []string) (err error) { p.opts.meili.indexing = parseString(val, defMeiliIndexing) case "WAYBACK_MEILI_APIKEY": p.opts.meili.apikey = parseString(val, defMeiliApikey) + case "WAYBACK_LLM_PROVIDER": + p.opts.llm.provider = parseString(val, defLLMProvider) + case "WAYBACK_LLM_APIKEY": + p.opts.llm.apikey = parseString(val, defLLMApiKey) + case "WAYBACK_LLM_MODEL": + p.opts.llm.model = parseString(val, defLLMModel) case "WAYBACK_OMNIVORE_APIKEY": p.opts.omnivore.apikey = parseString(val, defOmnivoreApikey) case "WAYBACK_PRIVACY_URL": diff --git a/docs/environment.md b/docs/environment.md index b7265e05..ef15eb7f 100644 --- a/docs/environment.md +++ b/docs/environment.md @@ -102,6 +102,9 @@ Use the `-c` / `--config` option to specify the build definition file to use. | - | `WAYBACK_ONION_LOCAL_PORT` | `8964` | Local port for Tor Hidden Service, also support for a **reverse proxy**. This is ignored if `WAYBACK_LISTEN_ADDR` is set. | | - | `WAYBACK_ONION_REMOTE_PORTS` | `80` | Remote ports for Tor Hidden Service, e.g. `WAYBACK_ONION_REMOTE_PORTS=80,81` | | - | `WAYBACK_ONION_DISABLED` | `false` | Disable onion service | +| - | `WAYBACK_LLM_PROVIDER` | `` | Enables AI-enhanced summary | +| - | `WAYBACK_LLM_APIKEY` | `` | LLM API key | +| - | `WAYBACK_LLM_MODEL` | `` | LLM model. Each provider has a sensible default: cohere: command-a-03-2025 \| openrouter: openrouter/auto. 
| | - | `WAYBACK_SLOT` | - | Pinning service for IPFS mode of pinner, see [ipfs-pinner](https://github.com/wabarc/ipfs-pinner#supported-pinning-services) | | - | `WAYBACK_APIKEY` | - | API key for pinning service | | - | `WAYBACK_SECRET` | - | API secret for pinning service | diff --git a/go.mod b/go.mod index 5ffdd307..8ccca4b3 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/davecgh/go-spew v1.1.1 github.com/dghubble/go-twitter v0.0.0-20201011215211-4b180d0cc78d github.com/dghubble/oauth1 v0.7.1 + github.com/didasy/tldr v0.7.0 github.com/dstotijn/go-notion v0.11.0 github.com/dustin/go-humanize v1.0.0 github.com/gabriel-vasile/mimetype v1.4.2 @@ -66,6 +67,7 @@ require ( github.com/MercuryEngineering/CookieMonster v0.0.0-20180304172713-1584578b3403 // indirect github.com/SaveTheRbtz/generic-sync-map-go v0.0.0-20230201052002-6c5833b989be // indirect github.com/VividCortex/ewma v1.2.0 // indirect + github.com/alixaxel/pagerank v0.0.0-20160306110729-14bfb4c1d88c // indirect github.com/andybalholm/brotli v1.1.0 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect github.com/benbjohnson/clock v1.3.5 // indirect @@ -86,7 +88,7 @@ require ( github.com/decred/dcrd/crypto/blake256 v1.0.1 // indirect github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 // indirect github.com/dghubble/sling v1.3.0 // indirect - github.com/dlclark/regexp2 v1.7.0 // indirect + github.com/dlclark/regexp2 v1.9.0 // indirect github.com/dop251/goja v0.0.0-20221115122301-6c0d9883792e // indirect github.com/fatih/color v1.16.0 // indirect github.com/fortytw2/leaktest v1.3.0 // indirect diff --git a/go.sum b/go.sum index f3e8375c..7ffbe835 100644 --- a/go.sum +++ b/go.sum @@ -16,6 +16,8 @@ github.com/VividCortex/ewma v1.2.0/go.mod h1:nz4BbCtbLyFDeC9SUHbtcT5644juEuWfUAU github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= 
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/alixaxel/pagerank v0.0.0-20160306110729-14bfb4c1d88c h1:UUHM6/UM34ESICar/DWOhLt2rqYabsvfjmupiY9z+iE= +github.com/alixaxel/pagerank v0.0.0-20160306110729-14bfb4c1d88c/go.mod h1:e7Vic/xXDZAQ8ftWoLnVrXseAAvt54SVYrcirjCKcX0= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= @@ -118,9 +120,12 @@ github.com/dghubble/sling v1.3.0 h1:pZHjCJq4zJvc6qVQ5wN1jo5oNZlNE0+8T/h0XeXBUKU= github.com/dghubble/sling v1.3.0/go.mod h1:XXShWaBWKzNLhu2OxikSNFrlsvowtz4kyRuXUG7oQKY= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= +github.com/didasy/tldr v0.7.0 h1:9kFLpmeGeGPPIRysln8B9USbW+L5zAAlw9ol8gwc2gU= +github.com/didasy/tldr v0.7.0/go.mod h1:1W7p626SAyEeSkAAzFJLAG/Hr6imK7sxEr+K6x7e7Ao= github.com/dlclark/regexp2 v1.4.1-0.20201116162257-a2a8dda75c91/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc= -github.com/dlclark/regexp2 v1.7.0 h1:7lJfhqlPssTb1WQx4yvTHN0uElPEv52sbaECrAQxjAo= github.com/dlclark/regexp2 v1.7.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= +github.com/dlclark/regexp2 v1.9.0 h1:pTK/l/3qYIKaRXuHnEnIf7Y5NxfRPfpb7dis6/gdlVI= +github.com/dlclark/regexp2 v1.9.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/dop251/goja v0.0.0-20211022113120-dc8c55024d06/go.mod h1:R9ET47fwRVRPZnOGvHxxhuZcbrMCuiqOz3Rlrh4KSnk= github.com/dop251/goja v0.0.0-20221115122301-6c0d9883792e h1:Uo51nR73BJlci20AE5tXT5qiLSGZy5LHnRlKt7VkcUM= github.com/dop251/goja v0.0.0-20221115122301-6c0d9883792e/go.mod h1:yRkwfj0CBpOGre+TwBsqPV0IH0Pk73e4PXJOeNDboGs= @@ 
-140,6 +145,8 @@ github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8 github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= +github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= +github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU= github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= @@ -351,6 +358,7 @@ github.com/multiformats/go-varint v0.0.7/go.mod h1:r8PUYw/fD/SjBCiKOoDlGF6QawOEL github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/nbd-wtf/go-nostr v0.17.1-0.20230426111250-32ca737acf77 h1:D7BdjjOD0D8r7RwLmrOTOJKEZ56D9YhLCEETz2Xh0Vo= github.com/nbd-wtf/go-nostr v0.17.1-0.20230426111250-32ca737acf77/go.mod h1:YCDHJtaFQE76d1ZkcUsTkz3dYNP+bldo5CIQwXPPcbk= +github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/oliamb/cutter v0.2.2 h1:Lfwkya0HHNU1YLnGv2hTkzHfasrSMkgv4Dn+5rmlk3k= @@ -358,11 +366,14 @@ github.com/oliamb/cutter v0.2.2/go.mod h1:4BenG2/4GuRBDbVm/OPahDVqbrOemzpPiG5mi1 github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= +github.com/onsi/ginkgo v1.14.0 
h1:2mOpI4JVVPBN+WQRa0WKH2eXR+Ey+uK4n7Zj0aYpIQA= github.com/onsi/ginkgo v1.14.0/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY= github.com/onsi/gomega v1.4.1/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE= +github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+qQlhg= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= @@ -682,6 +693,7 @@ gopkg.in/sourcemap.v1 v1.0.5 h1:inv58fC9f9J3TK2Y2R1NPntXEn3/wjWHkonhIUODNTI= gopkg.in/sourcemap.v1 v1.0.5/go.mod h1:2RlvNNSMglmRrcvhfuzp4hQHwOtjxlbjX7UPY/GXb78= gopkg.in/telebot.v3 v3.0.0-20220130115853-f0291132d3c3 h1:ifpOmJCnVni31dBAw99qxgCRfD33ROgv7vYxuhu+iWc= gopkg.in/telebot.v3 v3.0.0-20220130115853-f0291132d3c3/go.mod h1:7rExV8/0mDDNu9epSrDm/8j22KLaActH1Tbee6YjzWg= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/reduxer/reduxer.go b/reduxer/reduxer.go index 14cb6e09..dd6c2cb5 100644 --- a/reduxer/reduxer.go +++ b/reduxer/reduxer.go @@ -27,6 +27,7 @@ import ( "github.com/wabarc/wayback/config" "github.com/wabarc/wayback/errors" 
"github.com/wabarc/wayback/ingress" + "github.com/wabarc/wayback/summary" "golang.org/x/sync/errgroup" ) @@ -57,6 +58,7 @@ type bundle struct { shots *screenshot.Screenshots[screenshot.Path] artifact Artifact article readability.Article + summary string } // Artifact represents the file paths stored on the local disk. @@ -135,6 +137,11 @@ func (b *bundle) Article() readability.Article { return b.article } +// Summary returns a summary of article. +func (b *bundle) Summary() string { + return b.summary +} + // Do executes secreenshot, print PDF and export html of given URLs // Returns a set of bundle containing screenshot data and file path // nolint:gocyclo @@ -221,11 +228,19 @@ func Do(ctx context.Context, opts *config.Options, urls ...*url.URL) (Reduxer, e if err = os.WriteFile(fp, helper.String2Byte(article.TextContent), filePerm); err == nil && article.TextContent != "" { artifact.Txt.Local = fp } + + // Generate summary + summarizer := summary.NewSummary(opts) + sum, err := summarizer.Summarize(article.TextContent) + if err != nil { + logger.Error("sumarize failed: %v", err) + } + // Upload files to third-party server if err = remotely(ctx, artifact); err != nil { logger.Error("upload files to remote server failed: %v", err) } - bundle := &bundle{shots: shot, artifact: *artifact, article: article} + bundle := &bundle{shots: shot, artifact: *artifact, article: article, summary: sum} bs.Store(Src(shot.URL), bundle) return nil }) diff --git a/summary/chat.go b/summary/chat.go new file mode 100644 index 00000000..340be066 --- /dev/null +++ b/summary/chat.go @@ -0,0 +1,49 @@ +// Copyright 2026 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. 
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+const systemPrompt = `You are a digital archivist and information synthesizer, your expertise lies in distilling "noise" from legacy web data into high-signal summaries.
+
+Rules:
+- Summary point must be anchored by specific verbatim quotes
+- Ignore UI elements (navbars, footers) and focus on the core content
+- Be objective, clinical, and precise. Strip away marketing fluff to reveal the underlying data
+- Summary must be in the same language as the source content
+- Do NOT repeat ideas from previous snapshots unless conditions have materially changed
+
+FORMATTING RULES (STRICT):
+- STRICT PROHIBITION: Do not use Markdown bolding (**text**)
+- Use ONLY plain text without any formatting
+- Use simple line breaks to separate points
+- Do NOT use headers or bold labels
+
+The output should be a maximum of 280 plain paragraphs.`
+
+type chatMessage struct {
+	Role    string `json:"role"`
+	Content string `json:"content"`
+}
+
+type chatRequest struct {
+	Model    string        `json:"model"`
+	Messages []chatMessage `json:"messages"`
+}
+
+type chatContent struct {
+	Type string `json:"type"` // explicit tags, matching the other wire structs
+	Text string `json:"text"`
+}
+
+type chatChoice struct {
+	Message  chatMessage   `json:"message,omitempty"` // read by the OpenRouter client (choices[].message)
+	Role     string        `json:"role"`
+	Contents []chatContent `json:"content"` // read by the Cohere client (message.content[])
+}
+
+type chatResponse struct {
+	Message chatChoice   `json:"message,omitempty"`
+	ID      string       `json:"id"`
+	Choices []chatChoice `json:"choices,omitempty"`
+}
diff --git a/summary/cohere.go b/summary/cohere.go
new file mode 100644
index 00000000..0149273f
--- /dev/null
+++ b/summary/cohere.go
@@ -0,0 +1,96 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"strings"
+
+	"github.com/wabarc/wayback/config"
+	"github.com/wabarc/wayback/ingress"
+)
+
+// Interface guard
+var _ Summarizer = (*Cohere)(nil)
+
+// Cohere is a Summarizer that sends text to Cohere's chat API.
+type Cohere struct {
+	client *http.Client
+	apiKey string
+	model  string
+}
+
+// NewCohere creates a `Cohere` instance from the given `http.Client` and options.
+// If the `http.Client` is `nil`, `ingress.Client()` is used. The API key comes from
+// `opts.LLMApiKey()`; an empty `opts.LLMModel()` falls back to "command-a-03-2025".
+func NewCohere(c *http.Client, opts *config.Options) *Cohere {
+	if c == nil {
+		c = ingress.Client()
+	}
+	model := opts.LLMModel()
+	if model == "" {
+		model = "command-a-03-2025"
+	}
+
+	return &Cohere{
+		client: c,
+		apiKey: opts.LLMApiKey(),
+		model:  model,
+	}
+}
+
+// Summarize posts the trimmed input to Cohere's v2 chat endpoint and returns the reply text.
+// It errors on empty input or a non-2xx status, and returns the input when the reply has no text.
+func (coh *Cohere) Summarize(s string) (string, error) {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return "", fmt.Errorf("text not found")
+	}
+
+	body := chatRequest{
+		Model: coh.model,
+		Messages: []chatMessage{
+			{Role: "system", Content: systemPrompt},
+			{Role: "user", Content: s},
+		},
+	}
+	buf, err := json.Marshal(body)
+	if err != nil {
+		return "", fmt.Errorf("failed to marshal json: %w", err)
+	}
+
+	endpoint := "https://api.cohere.ai/v2/chat"
+	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(buf))
+	if err != nil {
+		return "", fmt.Errorf("failed to make request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json")
+	req.Header.Set("Authorization", "Bearer "+coh.apiKey)
+
+	res, err := coh.client.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer res.Body.Close()
+
+	if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices { // anything outside 2xx
+		return "", fmt.Errorf("cohere api error: status %d", res.StatusCode)
+	}
+
+	var cr chatResponse
+	if err := json.NewDecoder(res.Body).Decode(&cr); err != nil {
+		return "", fmt.Errorf("failed to decode body: %w", err)
+	}
+
+	if len(cr.Message.Contents) > 0 && strings.TrimSpace(cr.Message.Contents[0].Text) != "" {
+		return strings.TrimSpace(cr.Message.Contents[0].Text), nil
+	}
+
+	return s, nil // no text in the reply: hand back the trimmed input
+}
diff --git a/summary/cohere_test.go b/summary/cohere_test.go
new file mode 100644
index 00000000..67258b0d
--- /dev/null
+++ b/summary/cohere_test.go
@@ -0,0 +1,194 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/wabarc/helper"
+	"github.com/wabarc/wayback/config"
+)
+
+var (
+	apiKey            = os.Getenv("WAYBACK_LLM_APIKEY")
+	summarized        = "This is a summary of the test input."
+	summarizeResponse = []byte(fmt.Sprintf(`{
+    "summary": "%s"
+}`, summarized))
+
+	handleFunc = func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		switch r.URL.Path {
+		case "/v2/chat":
+			w.Write(summarizeResponse)
+		}
+	}
+)
+
+func TestNewCohere(t *testing.T) {
+	httpClient, mux, server := helper.MockServer()
+	defer server.Close()
+
+	mux.HandleFunc("/", handleFunc)
+
+	tests := []struct {
+		desc      string
+		client    *http.Client
+		key       string
+		expectErr bool
+		expectNil bool
+	}{
+		{
+			desc:      "Valid inputs",
+			client:    httpClient,
+			key:       "valid_api_key",
+			expectErr: false,
+			expectNil: false,
+		},
+		{
+			desc:      "Invalid API key",
+			client:    httpClient,
+			key:       apiKey,
+			expectErr: true,
+			expectNil: true,
+		},
+		{
+			desc:      "Nil http.Client",
+			client:    nil,
+			key:       apiKey,
+			expectErr: false,
+			expectNil: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.desc, func(t *testing.T) {
+			t.Setenv("WAYBACK_LLM_PROVIDER", "cohere")
+			t.Setenv("WAYBACK_LLM_APIKEY", tt.key)
+
+			parser := config.NewParser()
+			opts, err := parser.ParseEnvironmentVariables()
+			if err != nil {
+				t.Fatalf("Parse environment variables or flags failed, error: %v", err)
+			}
+
+			cohere := NewCohere(tt.client, opts)
+			if !tt.expectNil && cohere == nil {
+				t.Errorf("Unexpected nil value for Cohere instance")
+			}
+		})
+	}
+}
+
+func TestCohereSummarize(t *testing.T) {
+	tests := []struct {
+		name        string
+		input       string
+		mockStatus  int
+		mockBody    string
+		expected    string
+		expectedErr string
+	}{
+		{
+			name:        "Empty string",
+			input:       "",
+			expected:    "",
+			expectedErr: "text not found",
+		},
+		{
+			name:       "Valid input",
+			input:      "This is a test input for summarization.",
+			mockStatus: 200,
+			mockBody: `{
+				"message":{"role":"assistant","content":[
+					{"type":"text","text":"This is the summary."}
+				]}
+			}`,
+			expected:    "This is the summary.",
+			expectedErr: "",
+		},
+		{
+			name:        "API error status",
+			input:       "Non-empty",
+			mockStatus:  500,
+			mockBody:    `{"error":"server"}`,
+			expected:    "",
+			expectedErr: "cohere api error: status 500",
+		},
+	}
+
+	httpClient, mux, server := helper.MockServer()
+	defer server.Close()
+
+	// Register handler at expected endpoint path used by the client.
+	mux.HandleFunc("/v2/chat", func(w http.ResponseWriter, r *http.Request) {
+		// optional: assert method and headers
+		if r.Method != http.MethodPost {
+			http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+			return
+		}
+		// The reply is chosen from the user message (index 1; index 0 is the system prompt).
+		// Success replies use the message.content[].text shape that Summarize decodes.
+		var req struct {
+			Messages []struct {
+				Content string `json:"content"`
+			} `json:"messages"`
+		}
+		_ = json.NewDecoder(r.Body).Decode(&req)
+		r.Body.Close()
+
+		switch {
+		case strings.Contains(req.Messages[1].Content, "This is a test input for summarization."):
+			w.WriteHeader(200)
+			w.Write([]byte(`{"message":{"role":"assistant","content":[{"type":"text","text":"This is the summary."}]}}`))
+		case strings.Contains(req.Messages[1].Content, "Non-empty"):
+			w.WriteHeader(500)
+			w.Write([]byte("server error"))
+		default:
+			// default success
+			w.WriteHeader(200)
+			w.Write([]byte(`{"message":{"role":"assistant","content":[{"type":"text","text":"ok"}]}}`))
+		}
+	})
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Setenv("WAYBACK_LLM_PROVIDER", "cohere")
+			t.Setenv("WAYBACK_LLM_APIKEY", "test-key")
+
+			parser := config.NewParser()
+			opts, err := parser.ParseEnvironmentVariables()
+			if err != nil {
+				t.Fatalf("Parse environment variables or flags failed, error: %v", err)
+			}
+
+			coh := NewCohere(httpClient, opts)
+
+			actual, actualErr := coh.Summarize(tt.input)
+
+			if tt.expectedErr != "" {
+				if actualErr == nil {
+					t.Fatalf("expected error %q, got nil", tt.expectedErr)
+				}
+				if actualErr.Error() != tt.expectedErr {
+					t.Fatalf("unexpected error, got %q expected %q", actualErr.Error(), tt.expectedErr)
+				}
+				return
+			}
+
+			if actualErr != nil {
+				t.Fatalf("unexpected error: %v", actualErr)
+			}
+			if actual != tt.expected {
+				t.Fatalf(`unexpected summary, got "%v" instead of "%v"`, actual, tt.expected)
+			}
+		})
+	}
+}
diff --git a/summary/doc.go b/summary/doc.go
new file mode 100644
index 00000000..77056c34
--- /dev/null
+++ b/summary/doc.go
@@ -0,0 +1,9 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+/*
+Package summary is designed to provide a comprehensive set of tools for
+automated text summarization.
+*/
+package summary // import "github.com/wabarc/wayback/summary"
diff --git a/summary/legacy.go b/summary/legacy.go
new file mode 100644
index 00000000..e25f70bf
--- /dev/null
+++ b/summary/legacy.go
@@ -0,0 +1,49 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/didasy/tldr"
+)
+
+const maxCharacters = 128
+
+// Interface guard
+var _ Summarizer = (*Legacy)(nil)
+
+// Legacy implements the Summarizer interface using the tldr.Bag package to
+// perform local summarization.
+type Legacy struct {
+	*tldr.Bag
+}
+
+// NewLegacy creates a new instance of the Legacy struct with a new tldr.Bag instance.
+func NewLegacy() *Legacy {
+	return &Legacy{tldr.New()}
+}
+
+// Summarize generates a summary of the input text using legacy summarization.
+// It returns the summary as a string and any error that occurred during summarization.
+func (l *Legacy) Summarize(s string) (string, error) {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return "", fmt.Errorf("text not found")
+	}
+
+	l.MaxCharacters = maxCharacters
+	res, err := l.Bag.Summarize(s, 1) // ask tldr for a single sentence
+	if err != nil {
+		return "", fmt.Errorf("summarize failed: %w", err)
+	}
+
+	if len(res) == 0 {
+		return s, nil // nothing extracted: hand back the trimmed input
+	}
+
+	return res[0], nil
+}
diff --git a/summary/legacy_test.go b/summary/legacy_test.go
new file mode 100644
index 00000000..2ea661a7
--- /dev/null
+++ b/summary/legacy_test.go
@@ -0,0 +1,55 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+	"testing"
+)
+
+func TestLegacy(t *testing.T) {
+	// Define test cases as a slice of structs.
+	tests := []struct {
+		name       string
+		input      string
+		want       string
+		wantErr    bool
+		errMessage string
+	}{
+		{
+			name:       "valid input",
+			input:      "This is a test string.",
+			want:       "This is a test string.",
+			wantErr:    false,
+			errMessage: "",
+		},
+		{
+			name:       "empty input",
+			input:      "",
+			want:       "",
+			wantErr:    true,
+			errMessage: "text not found",
+		},
+	}
+
+	local := NewLegacy()
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := local.Summarize(tt.input)
+
+			if (err != nil) != tt.wantErr {
+				t.Fatalf(`Unexpected error status. Got "%v", but wanted error="%v"`, err, tt.wantErr)
+			}
+
+			if tt.wantErr && err.Error() != tt.errMessage {
+				t.Fatalf(`Unexpected error message. Got "%v", but wanted "%v"`, err.Error(), tt.errMessage)
+			}
+
+			if !tt.wantErr && got != tt.want {
+				t.Fatalf(`Unexpected summary. Got "%v", but wanted "%v"`, got, tt.want)
+			}
+		})
+	}
+}
diff --git a/summary/openrouter.go b/summary/openrouter.go
new file mode 100644
index 00000000..56a06c07
--- /dev/null
+++ b/summary/openrouter.go
@@ -0,0 +1,95 @@
+// Copyright 2026 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"strings"
+
+	"github.com/wabarc/wayback/config"
+	"github.com/wabarc/wayback/ingress"
+)
+
+// Interface guard
+var _ Summarizer = (*OpenRouter)(nil)
+
+// OpenRouter is a Summarizer that sends text to the OpenRouter chat completions API.
+type OpenRouter struct {
+	client *http.Client
+	apiKey string
+	model  string
+}
+
+// NewOpenRouter creates an `OpenRouter` instance from the given `http.Client` and options.
+// If the `http.Client` is `nil`, `ingress.Client()` is used. The API key comes from
+// `opts.LLMApiKey()`; an empty `opts.LLMModel()` falls back to "openrouter/auto".
+func NewOpenRouter(c *http.Client, opts *config.Options) *OpenRouter {
+	if c == nil {
+		c = ingress.Client()
+	}
+	model := opts.LLMModel()
+	if model == "" {
+		model = "openrouter/auto"
+	}
+
+	return &OpenRouter{
+		client: c,
+		apiKey: opts.LLMApiKey(),
+		model:  model,
+	}
+}
+
+// Summarize posts the trimmed input to OpenRouter's chat completions endpoint and returns the reply.
+// It errors on empty input or a non-2xx status, and returns the input when the reply has no text.
+func (or *OpenRouter) Summarize(s string) (string, error) {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return "", fmt.Errorf("text not found")
+	}
+
+	body := chatRequest{
+		Model: or.model,
+		Messages: []chatMessage{
+			{Role: "system", Content: systemPrompt},
+			{Role: "user", Content: s},
+		},
+	}
+	buf, err := json.Marshal(body)
+	if err != nil {
+		return "", fmt.Errorf("failed to marshal json: %w", err)
+	}
+
+	endpoint := "https://openrouter.ai/api/v1/chat/completions"
+	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(buf))
+	if err != nil {
+		return "", fmt.Errorf("failed to make request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+or.apiKey)
+
+	res, err := or.client.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer res.Body.Close()
+
+	if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices { // anything outside 2xx
+		return "", fmt.Errorf("openrouter api error: status %d", res.StatusCode)
+	}
+
+	var cr chatResponse
+	if err := json.NewDecoder(res.Body).Decode(&cr); err != nil {
+		return "", fmt.Errorf("failed to decode body: %w", err)
+	}
+
+	if len(cr.Choices) > 0 && strings.TrimSpace(cr.Choices[0].Message.Content) != "" {
+		return strings.TrimSpace(cr.Choices[0].Message.Content), nil
+	}
+
+	return s, nil // no text in the reply: hand back the trimmed input
+}
diff --git a/summary/openrouter_test.go b/summary/openrouter_test.go
new file mode 100644
index 00000000..0b262735
--- /dev/null
+++ b/summary/openrouter_test.go
@@ -0,0 +1,183 @@
+// Copyright 2026 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+ +package summary // import "github.com/wabarc/wayback/summary" + +import ( + "encoding/json" + "net/http" + "strings" + "testing" + + "github.com/wabarc/helper" + "github.com/wabarc/wayback/config" +) + +func TestNewOpenRouter(t *testing.T) { + httpClient, mux, server := helper.MockServer() + defer server.Close() + + handleFunc := func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/api/v1/chat/completions": + w.Write(summarizeResponse) + } + } + mux.HandleFunc("/", handleFunc) + + tests := []struct { + desc string + client *http.Client + key string + expectErr bool + expectNil bool + }{ + { + desc: "Valid inputs", + client: httpClient, + key: "valid_api_key", + expectErr: false, + expectNil: false, + }, + { + desc: "Invalid API key", + client: httpClient, + key: apiKey, + expectErr: true, + expectNil: true, + }, + { + desc: "Nil http.Client", + client: nil, + key: apiKey, + expectErr: false, + expectNil: false, + }, + } + + for _, tt := range tests { + t.Run(tt.desc, func(t *testing.T) { + t.Setenv("WAYBACK_LLM_PROVIDER", "cohere") + t.Setenv("WAYBACK_LLM_APIKEY", tt.key) + + parser := config.NewParser() + opts, err := parser.ParseEnvironmentVariables() + if err != nil { + t.Fatalf("Parse environment variables or flags failed, error: %v", err) + } + + cohere := NewOpenRouter(tt.client, opts) + if !tt.expectNil && cohere == nil { + t.Errorf("Unexpected nil value for OpenRouter instance") + } + }) + } +} + +func TestOpenRouterSummarize(t *testing.T) { + tests := []struct { + name string + input string + mockStatus int + mockBody string + expected string + expectedErr string + }{ + { + name: "Empty string", + input: "", + expected: "", + expectedErr: "text not found", + }, + { + name: "Valid input", + input: "This is a test input for summarization.", + mockStatus: 200, + mockBody: `{ + "messages":[ + {"role":"user","content":"This is the summary."} + ] + }`, + expected: "This is the 
summary.", + expectedErr: "", + }, + { + name: "API error status", + input: "Non-empty", + mockStatus: 500, + mockBody: `{"error":"server"}`, + expected: "", + expectedErr: "cohere api error: status 500", + }, + } + + httpClient, mux, server := helper.MockServer() + defer server.Close() + + // Register handler at expected endpoint path used by the client. + mux.HandleFunc("/api/v1/chat/completions", func(w http.ResponseWriter, r *http.Request) { + // optional: assert method and headers + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + // Find matching test case by inspecting body or rely on sequential handling. + // For simplicity, read body and decide response based on test inputs: + var req struct { + Messages []struct { + Content string `json:"content"` + } `json:"messages"` + } + _ = json.NewDecoder(r.Body).Decode(&req) + r.Body.Close() + + switch { + case strings.Contains(req.Messages[1].Content, "This is a test input for summarization."): + w.WriteHeader(200) + w.Write([]byte(`{"messages":[{"role":"assistant","content":"This is the summary."}]}`)) + case strings.Contains(req.Messages[1].Content, "Non-empty"): + w.WriteHeader(500) + w.Write([]byte("server error")) + default: + // default success + w.WriteHeader(200) + w.Write([]byte(`{"messages":[{"role":"assistant","content":"ok"}]}`)) + } + }) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv("WAYBACK_LLM_PROVIDER", "cohere") + t.Setenv("WAYBACK_LLM_APIKEY", "test-key") + + parser := config.NewParser() + opts, err := parser.ParseEnvironmentVariables() + if err != nil { + t.Fatalf("Parse environment variables or flags failed, error: %v", err) + } + + coh := NewOpenRouter(httpClient, opts) + + actual, actualErr := coh.Summarize(tt.input) + + if tt.expectedErr != "" { + if actualErr == nil { + t.Fatalf("expected error %q, got nil", tt.expectedErr) + } + if actualErr.Error() != tt.expectedErr { + t.Fatalf("unexpected 
error, got %q expected %q", actualErr.Error(), tt.expectedErr)
+				}
+				return
+			}
+
+			if actualErr != nil {
+				t.Fatalf("unexpected error: %v", actualErr)
+			}
+			if actual != tt.expected {
+				t.Fatalf(`unexpected summary, got "%v" instead of "%v"`, actual, tt.expected)
+			}
+		})
+	}
+}
diff --git a/summary/summary.go b/summary/summary.go
new file mode 100644
index 00000000..90f4ac31
--- /dev/null
+++ b/summary/summary.go
@@ -0,0 +1,34 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+	"strings"
+
+	"github.com/wabarc/wayback/config"
+	"github.com/wabarc/wayback/ingress"
+)
+
+// Summarizer is implemented by types that can condense a piece of text.
+//
+// Summarize receives the text to condense and returns its summary.
+type Summarizer interface {
+	Summarize(s string) (string, error)
+}
+
+// NewSummary selects a Summarizer according to the configured LLM provider.
+// The value of opts.LLMProvider() is matched case-insensitively: "cohere" and
+// "openrouter" select their respective implementations, constructed with
+// ingress.Client() and opts; any other value selects the legacy summarizer.
+func NewSummary(opts *config.Options) Summarizer {
+	switch provider := strings.ToLower(opts.LLMProvider()); provider {
+	case "cohere":
+		return NewCohere(ingress.Client(), opts)
+	case "openrouter":
+		return NewOpenRouter(ingress.Client(), opts)
+	default:
+		return NewLegacy()
+	}
+}
diff --git a/summary/summary_test.go b/summary/summary_test.go
new file mode 100644
index 00000000..0db62241
--- /dev/null
+++ b/summary/summary_test.go
@@ -0,0 +1,74 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+	"testing"
+
+	"github.com/wabarc/helper"
+	"github.com/wabarc/wayback/config"
+)
+
+// TestSummarize exercises the Cohere summarizer (against a mock HTTP server)
+// and the legacy summarizer through the Summarizer interface, checking whether
+// each input fails and, if it does, the exact error message.
+func TestSummarize(t *testing.T) {
+	client, mux, srv := helper.MockServer()
+	defer srv.Close()
+
+	mux.HandleFunc("/", handleFunc)
+
+	t.Setenv("WAYBACK_LLM_PROVIDER", "cohere")
+	t.Setenv("WAYBACK_LLM_APIKEY", "test-key")
+
+	opts, err := config.NewParser().ParseEnvironmentVariables()
+	if err != nil {
+		t.Fatalf("Parse environment variables or flags failed, error: %v", err)
+	}
+
+	cohere := NewCohere(client, opts)
+
+	// Zero-valued fields (wantErr, errMessage) are left out of the literals.
+	cases := []struct {
+		name       string
+		handler    Summarizer
+		input      string
+		wantErr    bool
+		errMessage string
+	}{
+		{
+			name:    "Valid Cohere handler",
+			handler: cohere,
+			input:   "This is a test string.",
+		},
+		{
+			name:    "Valid Locally handler",
+			handler: NewLegacy(),
+			input:   "This is a test string.",
+		},
+		{
+			name:       "Empty input",
+			handler:    cohere,
+			input:      "",
+			wantErr:    true,
+			errMessage: "text not found",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			_, err := tc.handler.Summarize(tc.input)
+			failed := err != nil
+
+			if failed != tc.wantErr {
+				t.Fatalf(`Unexpected error status. Got "%v", but wanted error="%v"`, err, tc.wantErr)
+			}
+
+			if tc.wantErr && err.Error() != tc.errMessage {
+				t.Fatalf(`Unexpected error message. Got "%v", but wanted "%v"`, err.Error(), tc.errMessage)
+			}
+		})
+	}
+}
diff --git a/template/render/discord.go b/template/render/discord.go
index 809ba788..b4ecfb54 100644
--- a/template/render/discord.go
+++ b/template/render/discord.go
@@ -59,7 +59,7 @@ func (d *Discord) ForPublish() (r *Render) {
 		tmplBytes.WriteString("\n\n")
 	}
 
-	if dgst := Digest(d.Cols, d.Data); dgst != "" {
+	if dgst := summaryOrDigest(d.Cols, d.Data); dgst != "" {
 		tmplBytes.WriteString(dgst)
 		tmplBytes.WriteString("\n\n")
 	}
diff --git a/template/render/github.go b/template/render/github.go
index 81988f3a..28834bd9 100644
--- a/template/render/github.go
+++ b/template/render/github.go
@@ -33,7 +33,7 @@ func (gh *GitHub) ForReply() *Render {
 func (gh *GitHub) ForPublish() *Render {
 	var tmplBytes bytes.Buffer
 
-	if dgst := Digest(gh.Cols, gh.Data); dgst != "" {
+	if dgst := summaryOrDigest(gh.Cols, gh.Data); dgst != "" {
 		tmplBytes.WriteString(dgst)
 		tmplBytes.WriteString("\n\n")
 	}
diff --git a/template/render/matrix.go b/template/render/matrix.go
index a810408f..511c8ce7 100644
--- a/template/render/matrix.go
+++ b/template/render/matrix.go
@@ -66,7 +66,7 @@ func (m *Matrix) ForPublish() *Render {
 		tmplBytes.WriteString(` ›

`) } - if dgst := Digest(m.Cols, m.Data); dgst != "" { + if dgst := summaryOrDigest(m.Cols, m.Data); dgst != "" { tmplBytes.WriteString(dgst) tmplBytes.WriteString(`

`)
 	}
diff --git a/template/render/render.go b/template/render/render.go
index a45c9c85..71192217 100644
--- a/template/render/render.go
+++ b/template/render/render.go
@@ -157,8 +157,8 @@ func Title(cols []wayback.Collect, rdx reduxer.Reduxer) (title string) {
 	return
 }
 
-// Digest returns digest of the webpage content. Its maximum length is defined by `maxDigestLen`.
-func Digest(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) {
+// digest returns digest of the webpage content. Its maximum length is defined by `maxDigestLen`.
+func digest(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) {
 	if rdx == nil {
 		return
 	}
@@ -185,6 +185,52 @@ func Digest(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) {
 	return
 }
 
+// summary returns the summaries stored in rdx for the webpages in cols.
+//
+// For every URI yielded by deDepURI(cols) whose bundle has a non-empty
+// Summary, the text is appended to the result. A text longer than
+// `maxDigestLen` runes is cut to that length and suffixed with ` ...`; the
+// limit applies to each bundle, not to the concatenated result.
+// It returns an empty string if rdx is nil or no summary is found.
+func summary(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) {
+	if rdx == nil {
+		return
+	}
+
+	for uri := range deDepURI(cols) {
+		bundle, ok := rdx.Load(reduxer.Src(uri))
+		if !ok {
+			continue
+		}
+		text := bundle.Summary()
+		if text == "" {
+			continue
+		}
+		logger.Debug("extracted summary from article content: %s", text)
+
+		// Truncate on rune boundaries so multi-byte characters are never split.
+		// A non-empty text always has at least one rune, so no zero-length
+		// case is needed here.
+		if t := []rune(text); len(t) > maxDigestLen {
+			dgst += string(t[:maxDigestLen]) + ` ...`
+		} else {
+			dgst += string(t)
+		}
+	}
+
+	return
+}
+
+// summaryOrDigest returns the summary of the webpage content, or its digest
+// when no summary is available.
+func summaryOrDigest(cols []wayback.Collect, rdx reduxer.Reduxer) string {
+	if sum := summary(cols, rdx); sum != "" {
+		return sum
+	}
+
+	return digest(cols, rdx)
+}
+
 // writeArtifact writes archived artifact of the webpage.
func writeArtifact(cols []wayback.Collect, rdx reduxer.Reduxer, fn func(art reduxer.Artifact)) { if rdx == nil { diff --git a/template/render/slack.go b/template/render/slack.go index 880c56d4..4027fb08 100644 --- a/template/render/slack.go +++ b/template/render/slack.go @@ -61,7 +61,7 @@ func (s *Slack) ForPublish() (r *Render) { tmplBytes.WriteString(" ›\n\n") } - if dgst := Digest(s.Cols, s.Data); dgst != "" { + if dgst := summaryOrDigest(s.Cols, s.Data); dgst != "" { tmplBytes.WriteString(dgst) tmplBytes.WriteString("\n\n") } diff --git a/template/render/telegram.go b/template/render/telegram.go index c64bb30f..b7c94fea 100644 --- a/template/render/telegram.go +++ b/template/render/telegram.go @@ -69,7 +69,7 @@ func (t *Telegram) ForPublish() (r *Render) { tmplBytes.WriteString("\n\n") } - if dgst := Digest(t.Cols, t.Data); dgst != "" { + if dgst := summaryOrDigest(t.Cols, t.Data); dgst != "" { tmplBytes.WriteString(dgst) tmplBytes.WriteString("\n\n") } diff --git a/wayback.1 b/wayback.1 index ae044242..9d6dcc70 100644 --- a/wayback.1 +++ b/wayback.1 @@ -224,6 +224,17 @@ Directory to store binary file, e.g. PDF, html file\&. .B WAYBACK_MAX_MEDIA_SIZE Max size to limit download stream media. default 512MB\&. .TP +.B WAYBACK_LLM_PROVIDER +Enables AI-enhanced summary. Provider options: cohere | openrouter\&. +.TP +.B WAYBACK_LLM_APIKEY +LLM API key\&. +.TP +.B WAYBACK_LLM_MODEL +LLM model. Each provider has a sensible default: +.br +cohere: command-a-03-2025 | openrouter: openrouter/auto\&. +.TP .B WAYBACK_MEDIA_SITES Extra media websites wish to be supported, separate with comma\&. 
.TP diff --git a/wayback.conf b/wayback.conf index 57a2bdd4..291c278d 100644 --- a/wayback.conf +++ b/wayback.conf @@ -76,6 +76,9 @@ WAYBACK_USERAGENT=WaybackArchiver/1.0 WAYBACK_FALLBACK=off WAYBACK_PROXY= WAYBACK_PRIVACY_URL= +WAYBACK_LLM_PROVIDER= +WAYBACK_LLM_APIKEY= +WAYBACK_LLM_MODEL= # ipfs slot: infura, pinata # doc: https://github.com/wabarc/ipfs-pinner#supported-pinning-services