From eafb06271e0c5936ceef751d8cd98baa995a30ea Mon Sep 17 00:00:00 2001 From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com> Date: Fri, 5 May 2023 09:37:10 +0100 Subject: [PATCH 1/8] Add support for AI-powered summarization --- config/options.go | 8 +++ config/parser.go | 2 + docs/environment.md | 1 + go.mod | 4 +- go.sum | 7 +- reduxer/reduxer.go | 16 ++++- summary/cohere.go | 54 +++++++++++++++ summary/cohere_test.go | 134 ++++++++++++++++++++++++++++++++++++ summary/doc.go | 9 +++ summary/summary.go | 12 ++++ template/render/discord.go | 2 +- template/render/github.go | 2 +- template/render/matrix.go | 2 +- template/render/render.go | 40 ++++++++++- template/render/slack.go | 2 +- template/render/telegram.go | 2 +- wayback.1 | 3 + wayback.conf | 1 + 18 files changed, 291 insertions(+), 10 deletions(-) create mode 100644 summary/cohere.go create mode 100644 summary/cohere_test.go create mode 100644 summary/doc.go create mode 100644 summary/summary.go diff --git a/config/options.go b/config/options.go index 959877bf..ece12f82 100644 --- a/config/options.go +++ b/config/options.go @@ -93,6 +93,7 @@ const ( defBoltPathname = "wayback.db" defPoolingSize = 3 defMaxMediaSize = "512MB" + defCohereApiKey = "" defWaybackTimeout = 300 defWaybackMaxRetries = 2 defWaybackUserAgent = "WaybackArchiver/1.0" @@ -156,6 +157,7 @@ type Options struct { boltPathname string maxMediaSize string poolingSize int + cohereApiKey string waybackTimeout int waybackMaxRetries int enabledChromeRemote bool @@ -289,6 +291,7 @@ func NewOptions() *Options { storageDir: defStorageDir, maxMediaSize: defMaxMediaSize, privacyURL: defPrivacyURL, + cohereApiKey: defCohereApiKey, waybackTimeout: defWaybackTimeout, waybackMaxRetries: defWaybackMaxRetries, waybackUserAgent: defWaybackUserAgent, @@ -951,6 +954,11 @@ func (o *Options) MaxMediaSize() uint64 { return size } +// CohereApiKey returns the apikey of Cohere. 
+func (o *Options) CohereApiKey() string { + return o.cohereApiKey +} + // MaxAttachSize returns max attach size limits for several services. // scope: telegram func (o *Options) MaxAttachSize(scope string) int64 { diff --git a/config/parser.go b/config/parser.go index 76d233a0..00a00402 100644 --- a/config/parser.go +++ b/config/parser.go @@ -223,6 +223,8 @@ func (p *Parser) parseLines(lines []string) (err error) { p.opts.storageDir = parseString(val, defStorageDir) case "WAYBACK_MAX_MEDIA_SIZE": p.opts.maxMediaSize = parseString(val, defMaxMediaSize) + case "WAYBACK_COHERE_APIKEY": + p.opts.cohereApiKey = parseString(val, defCohereApiKey) case "WAYBACK_TIMEOUT": p.opts.waybackTimeout = parseInt(val, defWaybackTimeout) case "WAYBACK_MAX_RETRIES": diff --git a/docs/environment.md b/docs/environment.md index b7265e05..a2a15de1 100644 --- a/docs/environment.md +++ b/docs/environment.md @@ -32,6 +32,7 @@ Use the `-c` / `--config` option to specify the build definition file to use. | - | `WAYBACK_BOLT_PATH` | `./wayback.db` | File path of bolt database | | - | `WAYBACK_STORAGE_DIR` | - | Directory to store binary file, e.g. 
PDF, html file | | - | `WAYBACK_MAX_MEDIA_SIZE` | `512MB` | Max size to limit download stream media | +| - | `WAYBACK_COHERE_APIKEY` | `` | Cohere API key | | - | `WAYBACK_MEDIA_SITES` | - | Extra media websites wish to be supported, separate with comma | | - | `WAYBACK_TIMEOUT` | `300` | Timeout for single wayback request, defaults to 300 second | | - | `WAYBACK_MAX_RETRIES` | `2` | Max retries for single wayback request, defaults to 2 | diff --git a/go.mod b/go.mod index 5ffdd307..c19d7d3a 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ go 1.24.0 require ( github.com/PuerkitoBio/goquery v1.9.0 github.com/bwmarrin/discordgo v0.28.1 + github.com/cohere-ai/cohere-go v0.2.0 github.com/cretz/bine v0.2.0 github.com/davecgh/go-spew v1.1.1 github.com/dghubble/go-twitter v0.0.0-20201011215211-4b180d0cc78d @@ -82,11 +83,12 @@ require ( github.com/chromedp/chromedp v0.9.5 // indirect github.com/chromedp/sysutil v1.0.0 // indirect github.com/cloudflare/circl v1.3.7 // indirect + github.com/cohere-ai/tokenizer v1.1.2 // indirect github.com/crackcomm/go-gitignore v0.0.0-20170627025303-887ab5e44cc3 // indirect github.com/decred/dcrd/crypto/blake256 v1.0.1 // indirect github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 // indirect github.com/dghubble/sling v1.3.0 // indirect - github.com/dlclark/regexp2 v1.7.0 // indirect + github.com/dlclark/regexp2 v1.9.0 // indirect github.com/dop251/goja v0.0.0-20221115122301-6c0d9883792e // indirect github.com/fatih/color v1.16.0 // indirect github.com/fortytw2/leaktest v1.3.0 // indirect diff --git a/go.sum b/go.sum index f3e8375c..eaca4e93 100644 --- a/go.sum +++ b/go.sum @@ -84,6 +84,10 @@ github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMn github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vcU= github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA= 
+github.com/cohere-ai/cohere-go v0.2.0 h1:Gljkn8LTtsAPy79ks1AVmZH9Av4kuQuXEgzEJ/1Ea34= +github.com/cohere-ai/cohere-go v0.2.0/go.mod h1:DFcCu5rwro4wAlluIXY9l17NLGiVBGb2bRio46RXBm8= +github.com/cohere-ai/tokenizer v1.1.2 h1:t3KwUBSpKiBVFtpnHBfVIQNmjfZUuqFVYuSFkZYOWpU= +github.com/cohere-ai/tokenizer v1.1.2/go.mod h1:9MNFPd9j1fuiEK3ua2HSCUxxcrfGMlSqpa93livg/C0= github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= @@ -119,8 +123,9 @@ github.com/dghubble/sling v1.3.0/go.mod h1:XXShWaBWKzNLhu2OxikSNFrlsvowtz4kyRuXU github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/dlclark/regexp2 v1.4.1-0.20201116162257-a2a8dda75c91/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc= -github.com/dlclark/regexp2 v1.7.0 h1:7lJfhqlPssTb1WQx4yvTHN0uElPEv52sbaECrAQxjAo= github.com/dlclark/regexp2 v1.7.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= +github.com/dlclark/regexp2 v1.9.0 h1:pTK/l/3qYIKaRXuHnEnIf7Y5NxfRPfpb7dis6/gdlVI= +github.com/dlclark/regexp2 v1.9.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/dop251/goja v0.0.0-20211022113120-dc8c55024d06/go.mod h1:R9ET47fwRVRPZnOGvHxxhuZcbrMCuiqOz3Rlrh4KSnk= github.com/dop251/goja v0.0.0-20221115122301-6c0d9883792e h1:Uo51nR73BJlci20AE5tXT5qiLSGZy5LHnRlKt7VkcUM= github.com/dop251/goja v0.0.0-20221115122301-6c0d9883792e/go.mod h1:yRkwfj0CBpOGre+TwBsqPV0IH0Pk73e4PXJOeNDboGs= diff --git a/reduxer/reduxer.go b/reduxer/reduxer.go index 14cb6e09..b1f55634 100644 --- a/reduxer/reduxer.go +++ b/reduxer/reduxer.go @@ -27,6 +27,7 @@ import ( "github.com/wabarc/wayback/config" "github.com/wabarc/wayback/errors" 
"github.com/wabarc/wayback/ingress" + "github.com/wabarc/wayback/summary" "golang.org/x/sync/errgroup" ) @@ -57,6 +58,7 @@ type bundle struct { shots *screenshot.Screenshots[screenshot.Path] artifact Artifact article readability.Article + summary string } // Artifact represents the file paths stored on the local disk. @@ -135,6 +137,11 @@ func (b *bundle) Article() readability.Article { return b.article } +// Summary returns a summary of article. +func (b *bundle) Summary() string { + return b.summary +} + // Do executes secreenshot, print PDF and export html of given URLs // Returns a set of bundle containing screenshot data and file path // nolint:gocyclo @@ -221,11 +228,18 @@ func Do(ctx context.Context, opts *config.Options, urls ...*url.URL) (Reduxer, e if err = os.WriteFile(fp, helper.String2Byte(article.TextContent), filePerm); err == nil && article.TextContent != "" { artifact.Txt.Local = fp } + + // Generate summary + sum := "" + if coh, err := summary.NewCohere(ingress.Client(), opts.CohereApiKey()); err == nil { + sum, _ = coh.Summarize(article.TextContent) // nolint:errcheck + } + // Upload files to third-party server if err = remotely(ctx, artifact); err != nil { logger.Error("upload files to remote server failed: %v", err) } - bundle := &bundle{shots: shot, artifact: *artifact, article: article} + bundle := &bundle{shots: shot, artifact: *artifact, article: article, summary: sum} bs.Store(Src(shot.URL), bundle) return nil }) diff --git a/summary/cohere.go b/summary/cohere.go new file mode 100644 index 00000000..fb9bcf4d --- /dev/null +++ b/summary/cohere.go @@ -0,0 +1,54 @@ +// Copyright 2023 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. 
+ +package summary // import "github.com/wabarc/wayback/summary" + +import ( + "fmt" + "net/http" + "strings" + + "github.com/cohere-ai/cohere-go" +) + +// Interface guard +var _ Summarizer = (*Cohere)(nil) + +// Cohere represents a text summarization algorithm powered by Cohere's AI models. +type Cohere struct { + client *cohere.Client +} + +// NewCohere creates a `Cohere` instance with the specified `http.Client` instance and API key. +// If the `http.Client` instance is `nil`, the default client is used. This function returns a pointer +// to the newly created `Cohere` instance and an error, if any. +func NewCohere(c *http.Client, key string) (*Cohere, error) { + coh, err := cohere.CreateClient(key) + if err != nil { + return nil, err + } + if c != nil { + coh.Client = *c + } + + return &Cohere{coh}, nil +} + +// Summarize generates a summary of the input text using Cohere's AI models. +// Returns the generated summary as a string and an error, if any. +func (coh *Cohere) Summarize(s string) (string, error) { + s = strings.TrimSpace(s) + if s == "" { + return "", fmt.Errorf("text not found") + } + + res, err := coh.client.Summarize(cohere.SummarizeOptions{ + Text: s, + }) + if err != nil { + return "", err + } + + return res.Summary, nil +} diff --git a/summary/cohere_test.go b/summary/cohere_test.go new file mode 100644 index 00000000..de22e341 --- /dev/null +++ b/summary/cohere_test.go @@ -0,0 +1,134 @@ +// Copyright 2023 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. + +package summary // import "github.com/wabarc/wayback/summary" + +import ( + "fmt" + "net/http" + "os" + "reflect" + "testing" + + "github.com/cohere-ai/cohere-go" + "github.com/wabarc/helper" +) + +var ( + apiKey = os.Getenv("COHERE_APIKEY") + summarized = "This is a summary of the test input." 
+ summarizeResponse = []byte(fmt.Sprintf(`{ + "summary": "%s" +}`, summarized)) + + handleFunc = func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/summarize": + w.Write(summarizeResponse) + } + } +) + +func TestNewCohere(t *testing.T) { + if apiKey == "" { + t.Skip(`Must set env "COHERE_APIKEY"`) + } + + httpClient, mux, server := helper.MockServer() + defer server.Close() + + mux.HandleFunc("/", handleFunc) + + tests := []struct { + desc string + client *http.Client + key string + expectErr bool + expectNil bool + }{ + { + desc: "Valid inputs", + client: httpClient, + key: "valid_api_key", + expectErr: false, + expectNil: false, + }, + { + desc: "Invalid API key", + client: httpClient, + key: apiKey, + expectErr: true, + expectNil: true, + }, + { + desc: "Nil http.Client", + client: nil, + key: apiKey, + expectErr: false, + expectNil: false, + }, + } + + for _, tt := range tests { + t.Run(tt.desc, func(t *testing.T) { + cohere, err := NewCohere(tt.client, tt.key) + if tt.expectErr && err == nil { + t.Errorf("Expected error but got nil") + } + if tt.expectNil && cohere != nil { + t.Errorf("Expected nil value for Cohere instance") + } + if !tt.expectNil && cohere == nil { + t.Errorf("Unexpected nil value for Cohere instance") + } + }) + } +} + +func TestCohere_Summarize(t *testing.T) { + tests := []struct { + name string + input string + expected string + expectedErr error + }{ + { + name: "Empty string", + input: "", + expected: "", + expectedErr: fmt.Errorf("text not found"), + }, + { + name: "Valid input", + input: "This is a test input for summarization.", + expected: summarized, + expectedErr: nil, + }, + } + + httpClient, mux, server := helper.MockServer() + defer server.Close() + + mux.HandleFunc("/", handleFunc) + + cohereClient := &cohere.Client{Client: *httpClient, BaseURL: server.URL + "/"} + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + coh := 
&Cohere{client: cohereClient} + + // Call the Summarize method + actual, actualErr := coh.Summarize(tt.input) + + // Check the results + if tt.expected != actual { + t.Fatalf(`unexpected summarize, got "%v" instead of "%v"`, actual, tt.expected) + } + if !reflect.DeepEqual(tt.expectedErr, actualErr) { + t.Fatalf(`unexpected summarize, got "%v" instead of "%v"`, actualErr, tt.expectedErr) + } + }) + } +} diff --git a/summary/doc.go b/summary/doc.go new file mode 100644 index 00000000..77056c34 --- /dev/null +++ b/summary/doc.go @@ -0,0 +1,9 @@ +// Copyright 2023 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. + +/* +Package summary is designed to provide a comprehensive set of tools for +automated text summarization. +*/ +package summary // import "github.com/wabarc/wayback/summary" diff --git a/summary/summary.go b/summary/summary.go new file mode 100644 index 00000000..3523b01f --- /dev/null +++ b/summary/summary.go @@ -0,0 +1,12 @@ +// Copyright 2023 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. + +package summary // import "github.com/wabarc/wayback/summary" + +// Summarizer is the interface that wraps the basic Summarize method. +// +// Summarize takes in a string of text and returns a summary. 
+type Summarizer interface { + Summarize(s string) (string, error) +} diff --git a/template/render/discord.go b/template/render/discord.go index 809ba788..b4ecfb54 100644 --- a/template/render/discord.go +++ b/template/render/discord.go @@ -59,7 +59,7 @@ func (d *Discord) ForPublish() (r *Render) { tmplBytes.WriteString("\n\n") } - if dgst := Digest(d.Cols, d.Data); dgst != "" { + if dgst := summaryOrDigest(d.Cols, d.Data); dgst != "" { tmplBytes.WriteString(dgst) tmplBytes.WriteString("\n\n") } diff --git a/template/render/github.go b/template/render/github.go index 81988f3a..28834bd9 100644 --- a/template/render/github.go +++ b/template/render/github.go @@ -33,7 +33,7 @@ func (gh *GitHub) ForReply() *Render { func (gh *GitHub) ForPublish() *Render { var tmplBytes bytes.Buffer - if dgst := Digest(gh.Cols, gh.Data); dgst != "" { + if dgst := summaryOrDigest(gh.Cols, gh.Data); dgst != "" { tmplBytes.WriteString(dgst) tmplBytes.WriteString("\n\n") } diff --git a/template/render/matrix.go b/template/render/matrix.go index a810408f..511c8ce7 100644 --- a/template/render/matrix.go +++ b/template/render/matrix.go @@ -66,7 +66,7 @@ func (m *Matrix) ForPublish() *Render { tmplBytes.WriteString(` ›

`) } - if dgst := Digest(m.Cols, m.Data); dgst != "" { + if dgst := summaryOrDigest(m.Cols, m.Data); dgst != "" { tmplBytes.WriteString(dgst) tmplBytes.WriteString(`

`) } diff --git a/template/render/render.go b/template/render/render.go index a45c9c85..71192217 100644 --- a/template/render/render.go +++ b/template/render/render.go @@ -157,8 +157,8 @@ func Title(cols []wayback.Collect, rdx reduxer.Reduxer) (title string) { return } -// Digest returns digest of the webpage content. Its maximum length is defined by `maxDigestLen`. -func Digest(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) { +// digest returns digest of the webpage content. Its maximum length is defined by `maxDigestLen`. +func digest(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) { if rdx == nil { return } @@ -185,6 +185,42 @@ func Digest(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) { return } +// summary returns summary of the webpage content. Its maximum length is defined by `maxDigestLen`. +func summary(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) { + if rdx == nil { + return + } + + for uri := range deDepURI(cols) { + if bundle, ok := rdx.Load(reduxer.Src(uri)); ok { + if text := bundle.Summary(); text != "" { + logger.Debug("extracted summary from article content: %s", text) + t := []rune(text) + l := len(t) + switch { + case l == 0: + continue + case l > maxDigestLen: + t = t[:maxDigestLen] + dgst += string(t) + ` ...` + default: + dgst += string(t) + } + } + } + } + + return +} + +func summaryOrDigest(cols []wayback.Collect, rdx reduxer.Reduxer) string { + if sum := summary(cols, rdx); sum != "" { + return sum + } + + return digest(cols, rdx) +} + // writeArtifact writes archived artifact of the webpage. 
func writeArtifact(cols []wayback.Collect, rdx reduxer.Reduxer, fn func(art reduxer.Artifact)) { if rdx == nil { diff --git a/template/render/slack.go b/template/render/slack.go index 880c56d4..4027fb08 100644 --- a/template/render/slack.go +++ b/template/render/slack.go @@ -61,7 +61,7 @@ func (s *Slack) ForPublish() (r *Render) { tmplBytes.WriteString(" ›\n\n") } - if dgst := Digest(s.Cols, s.Data); dgst != "" { + if dgst := summaryOrDigest(s.Cols, s.Data); dgst != "" { tmplBytes.WriteString(dgst) tmplBytes.WriteString("\n\n") } diff --git a/template/render/telegram.go b/template/render/telegram.go index c64bb30f..b7c94fea 100644 --- a/template/render/telegram.go +++ b/template/render/telegram.go @@ -69,7 +69,7 @@ func (t *Telegram) ForPublish() (r *Render) { tmplBytes.WriteString("\n\n") } - if dgst := Digest(t.Cols, t.Data); dgst != "" { + if dgst := summaryOrDigest(t.Cols, t.Data); dgst != "" { tmplBytes.WriteString(dgst) tmplBytes.WriteString("\n\n") } diff --git a/wayback.1 b/wayback.1 index ae044242..d89cd1aa 100644 --- a/wayback.1 +++ b/wayback.1 @@ -224,6 +224,9 @@ Directory to store binary file, e.g. PDF, html file\&. .B WAYBACK_MAX_MEDIA_SIZE Max size to limit download stream media. default 512MB\&. .TP +.B WAYBACK_COHERE_APIKEY +Cohere API key\&. +.TP .B WAYBACK_MEDIA_SITES Extra media websites wish to be supported, separate with comma\&. 
.TP diff --git a/wayback.conf b/wayback.conf index 57a2bdd4..b227e50c 100644 --- a/wayback.conf +++ b/wayback.conf @@ -76,6 +76,7 @@ WAYBACK_USERAGENT=WaybackArchiver/1.0 WAYBACK_FALLBACK=off WAYBACK_PROXY= WAYBACK_PRIVACY_URL= +WAYBACK_COHERE_APIKEY= # ipfs slot: infura, pinata # doc: https://github.com/wabarc/ipfs-pinner#supported-pinning-services From b996400700f352c4cbbaa1dbb3658acd35a1bb18 Mon Sep 17 00:00:00 2001 From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com> Date: Tue, 9 May 2023 16:33:00 +0100 Subject: [PATCH 2/8] Generating summaries using a local algorithm --- go.mod | 2 ++ go.sum | 11 ++++++ summary/locally.go | 49 +++++++++++++++++++++++++++ summary/locally_test.go | 55 ++++++++++++++++++++++++++++++ summary/summary.go | 32 ++++++++++++++++++ summary/summary_test.go | 75 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 224 insertions(+) create mode 100644 summary/locally.go create mode 100644 summary/locally_test.go create mode 100644 summary/summary_test.go diff --git a/go.mod b/go.mod index c19d7d3a..ecb46dbd 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ module github.com/wabarc/wayback go 1.24.0 require ( + github.com/JesusIslam/tldr v0.6.0 github.com/PuerkitoBio/goquery v1.9.0 github.com/bwmarrin/discordgo v0.28.1 github.com/cohere-ai/cohere-go v0.2.0 @@ -67,6 +68,7 @@ require ( github.com/MercuryEngineering/CookieMonster v0.0.0-20180304172713-1584578b3403 // indirect github.com/SaveTheRbtz/generic-sync-map-go v0.0.0-20230201052002-6c5833b989be // indirect github.com/VividCortex/ewma v1.2.0 // indirect + github.com/alixaxel/pagerank v0.0.0-20160306110729-14bfb4c1d88c // indirect github.com/andybalholm/brotli v1.1.0 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect github.com/benbjohnson/clock v1.3.5 // indirect diff --git a/go.sum b/go.sum index eaca4e93..eb2d5781 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT 
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/JesusIslam/tldr v0.6.0 h1:b5jc9m77g9vs9iREKSitBWhyC6YdemtqjAqiCJycwt0= +github.com/JesusIslam/tldr v0.6.0/go.mod h1:qnHomoqHP4q5qvOPggMBAnq7PB1V0CGF3+Dr4pcos74= github.com/MercuryEngineering/CookieMonster v0.0.0-20180304172713-1584578b3403 h1:EtZwYyLbkEcIt+B//6sujwRCnHuTEK3qiSypAX5aJeM= github.com/MercuryEngineering/CookieMonster v0.0.0-20180304172713-1584578b3403/go.mod h1:mM6WvakkX2m+NgMiPCfFFjwfH4KzENC07zeGEqq9U7s= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= @@ -16,6 +18,8 @@ github.com/VividCortex/ewma v1.2.0/go.mod h1:nz4BbCtbLyFDeC9SUHbtcT5644juEuWfUAU github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/alixaxel/pagerank v0.0.0-20160306110729-14bfb4c1d88c h1:UUHM6/UM34ESICar/DWOhLt2rqYabsvfjmupiY9z+iE= +github.com/alixaxel/pagerank v0.0.0-20160306110729-14bfb4c1d88c/go.mod h1:e7Vic/xXDZAQ8ftWoLnVrXseAAvt54SVYrcirjCKcX0= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= @@ -145,6 +149,8 @@ github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8 github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= 
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= +github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= +github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU= github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= @@ -356,6 +362,7 @@ github.com/multiformats/go-varint v0.0.7/go.mod h1:r8PUYw/fD/SjBCiKOoDlGF6QawOEL github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/nbd-wtf/go-nostr v0.17.1-0.20230426111250-32ca737acf77 h1:D7BdjjOD0D8r7RwLmrOTOJKEZ56D9YhLCEETz2Xh0Vo= github.com/nbd-wtf/go-nostr v0.17.1-0.20230426111250-32ca737acf77/go.mod h1:YCDHJtaFQE76d1ZkcUsTkz3dYNP+bldo5CIQwXPPcbk= +github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/oliamb/cutter v0.2.2 h1:Lfwkya0HHNU1YLnGv2hTkzHfasrSMkgv4Dn+5rmlk3k= @@ -363,11 +370,14 @@ github.com/oliamb/cutter v0.2.2/go.mod h1:4BenG2/4GuRBDbVm/OPahDVqbrOemzpPiG5mi1 github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= +github.com/onsi/ginkgo v1.14.0 h1:2mOpI4JVVPBN+WQRa0WKH2eXR+Ey+uK4n7Zj0aYpIQA= github.com/onsi/ginkgo v1.14.0/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY= github.com/onsi/gomega v1.4.1/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= github.com/onsi/gomega v1.4.3/go.mod 
h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE= +github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+qQlhg= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= @@ -687,6 +697,7 @@ gopkg.in/sourcemap.v1 v1.0.5 h1:inv58fC9f9J3TK2Y2R1NPntXEn3/wjWHkonhIUODNTI= gopkg.in/sourcemap.v1 v1.0.5/go.mod h1:2RlvNNSMglmRrcvhfuzp4hQHwOtjxlbjX7UPY/GXb78= gopkg.in/telebot.v3 v3.0.0-20220130115853-f0291132d3c3 h1:ifpOmJCnVni31dBAw99qxgCRfD33ROgv7vYxuhu+iWc= gopkg.in/telebot.v3 v3.0.0-20220130115853-f0291132d3c3/go.mod h1:7rExV8/0mDDNu9epSrDm/8j22KLaActH1Tbee6YjzWg= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/summary/locally.go b/summary/locally.go new file mode 100644 index 00000000..4e48e1a8 --- /dev/null +++ b/summary/locally.go @@ -0,0 +1,49 @@ +// Copyright 2023 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. 
+ +package summary // import "github.com/wabarc/wayback/summary" + +import ( + "fmt" + "strings" + + "github.com/JesusIslam/tldr" +) + +const maxCharacters = 128 + +// Interface guard +var _ Summarizer = (*Locally)(nil) + +// Locally implements the Summarizer interface using the tldr.Bag package to +// perform local summarization. +type Locally struct { + *tldr.Bag +} + +// NewLocally creates a new instance of the Locally struct with a new tldr.Bag instance. +func NewLocally() *Locally { + return &Locally{tldr.New()} +} + +// Summarize generates a summary of the input text using local summarization. +// It returns the summary as a string and any error that occurred during summarization. +func (l *Locally) Summarize(s string) (string, error) { + s = strings.TrimSpace(s) + if s == "" { + return "", fmt.Errorf("text not found") + } + + l.Bag.MaxCharacters = maxCharacters + res, err := l.Bag.Summarize(s, 1) + if err != nil { + return "", fmt.Errorf("summarize failed: %v", err) + } + + if len(res) == 0 { + return s, nil + } + + return res[0], nil +} diff --git a/summary/locally_test.go b/summary/locally_test.go new file mode 100644 index 00000000..da8f1d62 --- /dev/null +++ b/summary/locally_test.go @@ -0,0 +1,55 @@ +// Copyright 2023 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. + +package summary // import "github.com/wabarc/wayback/summary" + +import ( + "testing" +) + +func TestLocally(t *testing.T) { + // Define test cases as a slice of structs. 
+ tests := []struct { + name string + input string + want string + wantErr bool + errMessage string + }{ + { + name: "valid input", + input: "This is a test string.", + want: "This is a test string.", + wantErr: false, + errMessage: "", + }, + { + name: "empty input", + input: "", + want: "", + wantErr: true, + errMessage: "text not found", + }, + } + + local := NewLocally() + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := local.Summarize(tt.input) + + if (err != nil) != tt.wantErr { + t.Fatalf(`Unexpected error status. Got "%v", but wanted error="%v"`, err, tt.wantErr) + } + + if tt.wantErr && err.Error() != tt.errMessage { + t.Fatalf(`Unexpected error message. Got "%v", but wanted "%v"`, err.Error(), tt.errMessage) + } + + if !tt.wantErr && got != tt.want { + t.Fatalf(`Unexpected summary. Got "%v", but wanted "%v"`, got, tt.want) + } + }) + } +} diff --git a/summary/summary.go b/summary/summary.go index 3523b01f..87b3b5b3 100644 --- a/summary/summary.go +++ b/summary/summary.go @@ -4,9 +4,41 @@ package summary // import "github.com/wabarc/wayback/summary" +import ( + "fmt" + "strings" +) + // Summarizer is the interface that wraps the basic Summarize method. // // Summarize takes in a string of text and returns a summary. type Summarizer interface { Summarize(s string) (string, error) } + +// Interface guard +var _ Summarizer = (*Summary)(nil) + +// Summary provides a high-level interface for generating text summaries using +// different summarization methods. +type Summary struct { + Handler interface{} +} + +// Summarize generates a summary of the input text using the selected summarization method. +// It returns the summary as a string and any error that occurred during summarization. 
+func (sum *Summary) Summarize(s string) (string, error) { + s = strings.TrimSpace(s) + if s == "" { + return "", fmt.Errorf("text not found") + } + + switch handler := sum.Handler.(type) { + case *Cohere: + return handler.Summarize(s) + case *Locally: + return handler.Summarize(s) + default: + return "", fmt.Errorf("invalid handler") + } +} diff --git a/summary/summary_test.go b/summary/summary_test.go new file mode 100644 index 00000000..00d72c88 --- /dev/null +++ b/summary/summary_test.go @@ -0,0 +1,75 @@ +// Copyright 2023 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. + +package summary // import "github.com/wabarc/wayback/summary" + +import ( + "testing" + + "github.com/cohere-ai/cohere-go" + "github.com/wabarc/helper" +) + +func TestSummarize(t *testing.T) { + httpClient, mux, server := helper.MockServer() + defer server.Close() + + mux.HandleFunc("/", handleFunc) + + cohereClient := &cohere.Client{Client: *httpClient, BaseURL: server.URL + "/"} + coh := &Cohere{client: cohereClient} + + tests := []struct { + name string + handler interface{} + input string + wantErr bool + errMessage string + }{ + { + name: "valid Cohere handler", + handler: coh, + input: "This is a test string.", + wantErr: false, + errMessage: "", + }, + { + name: "valid Locally handler", + handler: NewLocally(), + input: "This is a test string.", + wantErr: false, + errMessage: "", + }, + { + name: "invalid handler", + handler: "invalid-handler", + input: "This is a test string.", + wantErr: true, + errMessage: "invalid handler", + }, + { + name: "empty input", + handler: coh, + input: "", + wantErr: true, + errMessage: "text not found", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sum := &Summary{Handler: tt.handler} + + _, err := sum.Summarize(tt.input) + + if (err != nil) != tt.wantErr { + t.Fatalf(`Unexpected error status. 
Got "%v", but wanted error="%v"`, err, tt.wantErr) + } + + if tt.wantErr && err.Error() != tt.errMessage { + t.Fatalf(`Unexpected error message. Got "%v", but wanted "%v"`, err.Error(), tt.errMessage) + } + }) + } +} From 55986d6aa1b75d54c249592931c8560aa1e5b6d9 Mon Sep 17 00:00:00 2001 From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com> Date: Tue, 9 May 2023 16:41:01 +0100 Subject: [PATCH 3/8] Locally-based summarization is the default --- reduxer/reduxer.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/reduxer/reduxer.go b/reduxer/reduxer.go index b1f55634..095fb103 100644 --- a/reduxer/reduxer.go +++ b/reduxer/reduxer.go @@ -230,10 +230,11 @@ func Do(ctx context.Context, opts *config.Options, urls ...*url.URL) (Reduxer, e } // Generate summary - sum := "" + tldr := &summary.Summary{Handler: summary.NewLocally()} if coh, err := summary.NewCohere(ingress.Client(), opts.CohereApiKey()); err == nil { - sum, _ = coh.Summarize(article.TextContent) // nolint:errcheck + tldr = &summary.Summary{Handler: coh} } + sum, _ := tldr.Summarize(article.TextContent) // nolint:errcheck // Upload files to third-party server if err = remotely(ctx, artifact); err != nil { From 33186b0f2e974fc18b36acb90951b8d8b947a515 Mon Sep 17 00:00:00 2001 From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com> Date: Sun, 26 Apr 2026 04:06:02 +0000 Subject: [PATCH 4/8] Rename locally to legacy --- reduxer/reduxer.go | 2 +- summary/{locally.go => legacy.go} | 18 +++++++++--------- summary/{locally_test.go => legacy_test.go} | 4 ++-- summary/summary.go | 2 +- summary/summary_test.go | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) rename summary/{locally.go => legacy.go} (62%) rename summary/{locally_test.go => legacy_test.go} (95%) diff --git a/reduxer/reduxer.go b/reduxer/reduxer.go index 095fb103..31f297dc 100644 --- a/reduxer/reduxer.go +++ b/reduxer/reduxer.go @@ -230,7 +230,7 @@ func Do(ctx context.Context, opts 
*config.Options, urls ...*url.URL) (Reduxer, e } // Generate summary - tldr := &summary.Summary{Handler: summary.NewLocally()} + tldr := &summary.Summary{Handler: summary.NewLegacy()} if coh, err := summary.NewCohere(ingress.Client(), opts.CohereApiKey()); err == nil { tldr = &summary.Summary{Handler: coh} } diff --git a/summary/locally.go b/summary/legacy.go similarity index 62% rename from summary/locally.go rename to summary/legacy.go index 4e48e1a8..b69fed9f 100644 --- a/summary/locally.go +++ b/summary/legacy.go @@ -8,28 +8,28 @@ import ( "fmt" "strings" - "github.com/JesusIslam/tldr" + "github.com/didasy/tldr" ) const maxCharacters = 128 // Interface guard -var _ Summarizer = (*Locally)(nil) +var _ Summarizer = (*Legacy)(nil) -// Locally implements the Summarizer interface using the tldr.Bag package to +// Legacy implements the Summarizer interface using the tldr.Bag package to // perform local summarization. -type Locally struct { +type Legacy struct { *tldr.Bag } -// NewLocally creates a new instance of the Locally struct with a new tldr.Bag instance. -func NewLocally() *Locally { - return &Locally{tldr.New()} +// NewLegacy creates a new instance of the Legacy struct with a new tldr.Bag instance. +func NewLegacy() *Legacy { + return &Legacy{tldr.New()} } -// Summarize generates a summary of the input text using local summarization. +// Summarize generates a summary of the input text using legacy summarization. // It returns the summary as a string and any error that occurred during summarization. 
-func (l *Locally) Summarize(s string) (string, error) { +func (l *Legacy) Summarize(s string) (string, error) { s = strings.TrimSpace(s) if s == "" { return "", fmt.Errorf("text not found") diff --git a/summary/locally_test.go b/summary/legacy_test.go similarity index 95% rename from summary/locally_test.go rename to summary/legacy_test.go index da8f1d62..2ea661a7 100644 --- a/summary/locally_test.go +++ b/summary/legacy_test.go @@ -8,7 +8,7 @@ import ( "testing" ) -func TestLocally(t *testing.T) { +func TestLegacy(t *testing.T) { // Define test cases as a slice of structs. tests := []struct { name string @@ -33,7 +33,7 @@ func TestLocally(t *testing.T) { }, } - local := NewLocally() + local := NewLegacy() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/summary/summary.go b/summary/summary.go index 87b3b5b3..552d6cf2 100644 --- a/summary/summary.go +++ b/summary/summary.go @@ -36,7 +36,7 @@ func (sum *Summary) Summarize(s string) (string, error) { switch handler := sum.Handler.(type) { case *Cohere: return handler.Summarize(s) - case *Locally: + case *Legacy: return handler.Summarize(s) default: return "", fmt.Errorf("invalid handler") diff --git a/summary/summary_test.go b/summary/summary_test.go index 00d72c88..19566be4 100644 --- a/summary/summary_test.go +++ b/summary/summary_test.go @@ -36,7 +36,7 @@ func TestSummarize(t *testing.T) { }, { name: "valid Locally handler", - handler: NewLocally(), + handler: NewLegacy(), input: "This is a test string.", wantErr: false, errMessage: "", From 01c9450bb47394ead17997756f82c292f8007490 Mon Sep 17 00:00:00 2001 From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com> Date: Sun, 26 Apr 2026 14:43:16 +0000 Subject: [PATCH 5/8] Add openrouter provider --- config/options.go | 34 ++++++- config/parser.go | 8 +- docs/environment.md | 4 +- go.mod | 4 +- go.sum | 8 +- reduxer/reduxer.go | 8 +- summary/chat.go | 44 +++++++++ summary/cohere.go | 68 +++++++++++--- 
summary/cohere_test.go | 122 ++++++++++++++++++------- summary/openrouter.go | 95 +++++++++++++++++++ summary/openrouter_test.go | 183 +++++++++++++++++++++++++++++++++++++ summary/summary.go | 32 +++---- summary/summary_test.go | 33 ++++--- wayback.1 | 12 ++- wayback.conf | 4 +- 15 files changed, 556 insertions(+), 103 deletions(-) create mode 100644 summary/chat.go create mode 100644 summary/openrouter.go create mode 100644 summary/openrouter_test.go diff --git a/config/options.go b/config/options.go index ece12f82..9c9ca3d3 100644 --- a/config/options.go +++ b/config/options.go @@ -113,6 +113,10 @@ const ( defDatabaseMinConns = 1 defDatabaseConnectionLifetime = 5 + defLLMProvider = "" + defLLMApiKey = "" + defLLMModel = "" + maxAttachSizeTelegram = 50000000 // 50MB maxAttachSizeDiscord = 8000000 // 8MB maxAttachSizeSlack = 5000000000 // 5GB @@ -146,6 +150,7 @@ type Options struct { notion *notion matrix *matrix slack *slack + llm *llm services sync.Map privacyURL string storageDir string @@ -157,7 +162,6 @@ type Options struct { boltPathname string maxMediaSize string poolingSize int - cohereApiKey string waybackTimeout int waybackMaxRetries int enabledChromeRemote bool @@ -271,6 +275,12 @@ type meili struct { apikey string } +type llm struct { + provider string + apikey string + model string +} + type omnivore struct { apikey string } @@ -291,7 +301,6 @@ func NewOptions() *Options { storageDir: defStorageDir, maxMediaSize: defMaxMediaSize, privacyURL: defPrivacyURL, - cohereApiKey: defCohereApiKey, waybackTimeout: defWaybackTimeout, waybackMaxRetries: defWaybackMaxRetries, waybackUserAgent: defWaybackUserAgent, @@ -389,6 +398,11 @@ func NewOptions() *Options { indexing: defMeiliIndexing, apikey: defMeiliApikey, }, + llm: &llm{ + provider: defLLMProvider, + apikey: defLLMApiKey, + model: defLLMModel, + }, omnivore: &omnivore{ apikey: defOmnivoreApikey, }, @@ -954,9 +968,19 @@ func (o *Options) MaxMediaSize() uint64 { return size } -// CohereApiKey returns the 
apikey of Cohere. -func (o *Options) CohereApiKey() string { - return o.cohereApiKey +// LLMProvider returns the LLM provider. +func (o *Options) LLMProvider() string { + return o.llm.provider +} + +// LLMApiKey returns the apikey of LLM provider. +func (o *Options) LLMApiKey() string { + return o.llm.apikey +} + +// LLMModel returns the model of LLM provider. +func (o *Options) LLMModel() string { + return o.llm.model } // MaxAttachSize returns max attach size limits for several services. diff --git a/config/parser.go b/config/parser.go index 00a00402..0d3df878 100644 --- a/config/parser.go +++ b/config/parser.go @@ -223,8 +223,6 @@ func (p *Parser) parseLines(lines []string) (err error) { p.opts.storageDir = parseString(val, defStorageDir) case "WAYBACK_MAX_MEDIA_SIZE": p.opts.maxMediaSize = parseString(val, defMaxMediaSize) - case "WAYBACK_COHERE_APIKEY": - p.opts.cohereApiKey = parseString(val, defCohereApiKey) case "WAYBACK_TIMEOUT": p.opts.waybackTimeout = parseInt(val, defWaybackTimeout) case "WAYBACK_MAX_RETRIES": @@ -239,6 +237,12 @@ func (p *Parser) parseLines(lines []string) (err error) { p.opts.meili.indexing = parseString(val, defMeiliIndexing) case "WAYBACK_MEILI_APIKEY": p.opts.meili.apikey = parseString(val, defMeiliApikey) + case "WAYBACK_LLM_PROVIDER": + p.opts.llm.provider = parseString(val, defLLMProvider) + case "WAYBACK_LLM_APIKEY": + p.opts.llm.apikey = parseString(val, defLLMApiKey) + case "WAYBACK_LLM_MODEL": + p.opts.llm.model = parseString(val, defLLMModel) case "WAYBACK_OMNIVORE_APIKEY": p.opts.omnivore.apikey = parseString(val, defOmnivoreApikey) case "WAYBACK_PRIVACY_URL": diff --git a/docs/environment.md b/docs/environment.md index a2a15de1..ef15eb7f 100644 --- a/docs/environment.md +++ b/docs/environment.md @@ -32,7 +32,6 @@ Use the `-c` / `--config` option to specify the build definition file to use. 
| - | `WAYBACK_BOLT_PATH` | `./wayback.db` | File path of bolt database | | - | `WAYBACK_STORAGE_DIR` | - | Directory to store binary file, e.g. PDF, html file | | - | `WAYBACK_MAX_MEDIA_SIZE` | `512MB` | Max size to limit download stream media | -| - | `WAYBACK_COHERE_APIKEY` | `` | Cohere API key | | - | `WAYBACK_MEDIA_SITES` | - | Extra media websites wish to be supported, separate with comma | | - | `WAYBACK_TIMEOUT` | `300` | Timeout for single wayback request, defaults to 300 second | | - | `WAYBACK_MAX_RETRIES` | `2` | Max retries for single wayback request, defaults to 2 | @@ -103,6 +102,9 @@ Use the `-c` / `--config` option to specify the build definition file to use. | - | `WAYBACK_ONION_LOCAL_PORT` | `8964` | Local port for Tor Hidden Service, also support for a **reverse proxy**. This is ignored if `WAYBACK_LISTEN_ADDR` is set. | | - | `WAYBACK_ONION_REMOTE_PORTS` | `80` | Remote ports for Tor Hidden Service, e.g. `WAYBACK_ONION_REMOTE_PORTS=80,81` | | - | `WAYBACK_ONION_DISABLED` | `false` | Disable onion service | +| - | `WAYBACK_LLM_PROVIDER` | `` | Enables AI-enhanced summary | +| - | `WAYBACK_LLM_APIKEY` | `` | LLM API key | +| - | `WAYBACK_LLM_MODEL` | `` | LLM model. Each provider has a sensible default: cohere: command-a-03-2025 \| openrouter: openrouter/auto. 
| | - | `WAYBACK_SLOT` | - | Pinning service for IPFS mode of pinner, see [ipfs-pinner](https://github.com/wabarc/ipfs-pinner#supported-pinning-services) | | - | `WAYBACK_APIKEY` | - | API key for pinning service | | - | `WAYBACK_SECRET` | - | API secret for pinning service | diff --git a/go.mod b/go.mod index ecb46dbd..8ccca4b3 100644 --- a/go.mod +++ b/go.mod @@ -5,14 +5,13 @@ module github.com/wabarc/wayback go 1.24.0 require ( - github.com/JesusIslam/tldr v0.6.0 github.com/PuerkitoBio/goquery v1.9.0 github.com/bwmarrin/discordgo v0.28.1 - github.com/cohere-ai/cohere-go v0.2.0 github.com/cretz/bine v0.2.0 github.com/davecgh/go-spew v1.1.1 github.com/dghubble/go-twitter v0.0.0-20201011215211-4b180d0cc78d github.com/dghubble/oauth1 v0.7.1 + github.com/didasy/tldr v0.7.0 github.com/dstotijn/go-notion v0.11.0 github.com/dustin/go-humanize v1.0.0 github.com/gabriel-vasile/mimetype v1.4.2 @@ -85,7 +84,6 @@ require ( github.com/chromedp/chromedp v0.9.5 // indirect github.com/chromedp/sysutil v1.0.0 // indirect github.com/cloudflare/circl v1.3.7 // indirect - github.com/cohere-ai/tokenizer v1.1.2 // indirect github.com/crackcomm/go-gitignore v0.0.0-20170627025303-887ab5e44cc3 // indirect github.com/decred/dcrd/crypto/blake256 v1.0.1 // indirect github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 // indirect diff --git a/go.sum b/go.sum index eb2d5781..7ffbe835 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,6 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/JesusIslam/tldr v0.6.0 h1:b5jc9m77g9vs9iREKSitBWhyC6YdemtqjAqiCJycwt0= -github.com/JesusIslam/tldr v0.6.0/go.mod h1:qnHomoqHP4q5qvOPggMBAnq7PB1V0CGF3+Dr4pcos74= github.com/MercuryEngineering/CookieMonster 
v0.0.0-20180304172713-1584578b3403 h1:EtZwYyLbkEcIt+B//6sujwRCnHuTEK3qiSypAX5aJeM= github.com/MercuryEngineering/CookieMonster v0.0.0-20180304172713-1584578b3403/go.mod h1:mM6WvakkX2m+NgMiPCfFFjwfH4KzENC07zeGEqq9U7s= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= @@ -88,10 +86,6 @@ github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMn github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vcU= github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA= -github.com/cohere-ai/cohere-go v0.2.0 h1:Gljkn8LTtsAPy79ks1AVmZH9Av4kuQuXEgzEJ/1Ea34= -github.com/cohere-ai/cohere-go v0.2.0/go.mod h1:DFcCu5rwro4wAlluIXY9l17NLGiVBGb2bRio46RXBm8= -github.com/cohere-ai/tokenizer v1.1.2 h1:t3KwUBSpKiBVFtpnHBfVIQNmjfZUuqFVYuSFkZYOWpU= -github.com/cohere-ai/tokenizer v1.1.2/go.mod h1:9MNFPd9j1fuiEK3ua2HSCUxxcrfGMlSqpa93livg/C0= github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= @@ -126,6 +120,8 @@ github.com/dghubble/sling v1.3.0 h1:pZHjCJq4zJvc6qVQ5wN1jo5oNZlNE0+8T/h0XeXBUKU= github.com/dghubble/sling v1.3.0/go.mod h1:XXShWaBWKzNLhu2OxikSNFrlsvowtz4kyRuXUG7oQKY= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= +github.com/didasy/tldr v0.7.0 h1:9kFLpmeGeGPPIRysln8B9USbW+L5zAAlw9ol8gwc2gU= +github.com/didasy/tldr v0.7.0/go.mod h1:1W7p626SAyEeSkAAzFJLAG/Hr6imK7sxEr+K6x7e7Ao= github.com/dlclark/regexp2 v1.4.1-0.20201116162257-a2a8dda75c91/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc= 
github.com/dlclark/regexp2 v1.7.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/dlclark/regexp2 v1.9.0 h1:pTK/l/3qYIKaRXuHnEnIf7Y5NxfRPfpb7dis6/gdlVI= diff --git a/reduxer/reduxer.go b/reduxer/reduxer.go index 31f297dc..dd6c2cb5 100644 --- a/reduxer/reduxer.go +++ b/reduxer/reduxer.go @@ -230,11 +230,11 @@ func Do(ctx context.Context, opts *config.Options, urls ...*url.URL) (Reduxer, e } // Generate summary - tldr := &summary.Summary{Handler: summary.NewLegacy()} - if coh, err := summary.NewCohere(ingress.Client(), opts.CohereApiKey()); err == nil { - tldr = &summary.Summary{Handler: coh} + summarizer := summary.NewSummary(opts) + sum, err := summarizer.Summarize(article.TextContent) + if err != nil { + logger.Error("sumarize failed: %v", err) } - sum, _ := tldr.Summarize(article.TextContent) // nolint:errcheck // Upload files to third-party server if err = remotely(ctx, artifact); err != nil { diff --git a/summary/chat.go b/summary/chat.go new file mode 100644 index 00000000..2f913479 --- /dev/null +++ b/summary/chat.go @@ -0,0 +1,44 @@ +// Copyright 2026 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. + +package summary // import "github.com/wabarc/wayback/summary" + +const systemPrompt = `You are a digital archivist and information synthesiser, your expertise lies in distilling "noise" from legacy web data into high-signal summaries. + + +Rules: +- Summary point must be anchored by specific verbatim quotes +- Ignore UI elements (navbars, footers) and focus on the core content +- Be objective, clinical, and precise. 
Strip away marketing fluff to reveal the underlying data +- Summary must be in the same language as the source content +- Do NOT repeat ideas from previous snapshots unless conditions have materially changed + +The output should be a maximum of 280 plain text characters.` + +type chatMessage struct { + Role string `json:"role"` + Content string `json:"content"` +} + +type chatRequest struct { + Messages []chatMessage `json:"messages"` + Model string `json:"model"` +} + +type chatContent struct { + Type string + Text string +} + +type chatChoice struct { + Contents []chatContent `json:"content"` + Message chatMessage `json:"message,omitempty"` + Role string `json:"role"` +} + +type chatResponse struct { + Message chatChoice `json:"message,omitempty"` + Choices []chatChoice `json:"choices,omitempty"` + ID string `json:"id"` +} diff --git a/summary/cohere.go b/summary/cohere.go index fb9bcf4d..0149273f 100644 --- a/summary/cohere.go +++ b/summary/cohere.go @@ -5,11 +5,14 @@ package summary // import "github.com/wabarc/wayback/summary" import ( + "bytes" + "encoding/json" "fmt" "net/http" "strings" - "github.com/cohere-ai/cohere-go" + "github.com/wabarc/wayback/config" + "github.com/wabarc/wayback/ingress" ) // Interface guard @@ -17,22 +20,28 @@ var _ Summarizer = (*Cohere)(nil) // Cohere represents a text summarization algorithm powered by Cohere's AI models. type Cohere struct { - client *cohere.Client + client *http.Client + apiKey string + model string } // NewCohere creates a `Cohere` instance with the specified `http.Client` instance and API key. // If the `http.Client` instance is `nil`, the default client is used. This function returns a pointer // to the newly created `Cohere` instance and an error, if any. 
-func NewCohere(c *http.Client, key string) (*Cohere, error) { - coh, err := cohere.CreateClient(key) - if err != nil { - return nil, err +func NewCohere(c *http.Client, opts *config.Options) *Cohere { + if c == nil { + c = ingress.Client() } - if c != nil { - coh.Client = *c + model := opts.LLMModel() + if model == "" { + model = "command-a-03-2025" } - return &Cohere{coh}, nil + return &Cohere{ + client: c, + apiKey: opts.LLMApiKey(), + model: model, + } } // Summarize generates a summary of the input text using Cohere's AI models. @@ -43,12 +52,45 @@ func (coh *Cohere) Summarize(s string) (string, error) { return "", fmt.Errorf("text not found") } - res, err := coh.client.Summarize(cohere.SummarizeOptions{ - Text: s, - }) + body := chatRequest{ + Model: coh.model, + Messages: []chatMessage{ + {Role: "system", Content: systemPrompt}, + {Role: "user", Content: s}, + }, + } + buf, err := json.Marshal(body) + if err != nil { + return "", fmt.Errorf("failed to marshal json: %v", err) + } + + endpoint := "https://api.cohere.ai/v2/chat" + req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(buf)) + if err != nil { + return "", fmt.Errorf("failed to make request: %v", err) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + req.Header.Set("Authorization", "Bearer "+coh.apiKey) + + res, err := coh.client.Do(req) if err != nil { return "", err } + defer res.Body.Close() + + if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices { + return "", fmt.Errorf("cohere api error: status %d", res.StatusCode) + } + + var cr chatResponse + if err := json.NewDecoder(res.Body).Decode(&cr); err != nil { + return "", fmt.Errorf("failed to decode body: %v", err) + } + + if len(cr.Message.Contents) > 0 && strings.TrimSpace(cr.Message.Contents[0].Text) != "" { + return strings.TrimSpace(cr.Message.Contents[0].Text), nil + } - return res.Summary, nil + return s, nil } diff --git 
a/summary/cohere_test.go b/summary/cohere_test.go index de22e341..67258b0d 100644 --- a/summary/cohere_test.go +++ b/summary/cohere_test.go @@ -5,18 +5,19 @@ package summary // import "github.com/wabarc/wayback/summary" import ( + "encoding/json" "fmt" "net/http" "os" - "reflect" + "strings" "testing" - "github.com/cohere-ai/cohere-go" "github.com/wabarc/helper" + "github.com/wabarc/wayback/config" ) var ( - apiKey = os.Getenv("COHERE_APIKEY") + apiKey = os.Getenv("WAYBACK_LLM_APIKEY") summarized = "This is a summary of the test input." summarizeResponse = []byte(fmt.Sprintf(`{ "summary": "%s" @@ -25,17 +26,13 @@ var ( handleFunc = func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") switch r.URL.Path { - case "/summarize": + case "/v2/chat": w.Write(summarizeResponse) } } ) func TestNewCohere(t *testing.T) { - if apiKey == "" { - t.Skip(`Must set env "COHERE_APIKEY"`) - } - httpClient, mux, server := helper.MockServer() defer server.Close() @@ -73,13 +70,16 @@ func TestNewCohere(t *testing.T) { for _, tt := range tests { t.Run(tt.desc, func(t *testing.T) { - cohere, err := NewCohere(tt.client, tt.key) - if tt.expectErr && err == nil { - t.Errorf("Expected error but got nil") - } - if tt.expectNil && cohere != nil { - t.Errorf("Expected nil value for Cohere instance") + t.Setenv("WAYBACK_LLM_PROVIDER", "cohere") + t.Setenv("WAYBACK_LLM_APIKEY", tt.key) + + parser := config.NewParser() + opts, err := parser.ParseEnvironmentVariables() + if err != nil { + t.Fatalf("Parse environment variables or flags failed, error: %v", err) } + + cohere := NewCohere(tt.client, opts) if !tt.expectNil && cohere == nil { t.Errorf("Unexpected nil value for Cohere instance") } @@ -87,47 +87,107 @@ func TestNewCohere(t *testing.T) { } } -func TestCohere_Summarize(t *testing.T) { +func TestCohereSummarize(t *testing.T) { tests := []struct { name string input string + mockStatus int + mockBody string expected string - expectedErr error + 
expectedErr string }{ { name: "Empty string", input: "", expected: "", - expectedErr: fmt.Errorf("text not found"), + expectedErr: "text not found", }, { - name: "Valid input", - input: "This is a test input for summarization.", - expected: summarized, - expectedErr: nil, + name: "Valid input", + input: "This is a test input for summarization.", + mockStatus: 200, + mockBody: `{ + "messages":[ + {"role":"user","content":"This is the summary."} + ] + }`, + expected: "This is the summary.", + expectedErr: "", + }, + { + name: "API error status", + input: "Non-empty", + mockStatus: 500, + mockBody: `{"error":"server"}`, + expected: "", + expectedErr: "cohere api error: status 500", }, } httpClient, mux, server := helper.MockServer() defer server.Close() - mux.HandleFunc("/", handleFunc) - - cohereClient := &cohere.Client{Client: *httpClient, BaseURL: server.URL + "/"} + // Register handler at expected endpoint path used by the client. + mux.HandleFunc("/v2/chat", func(w http.ResponseWriter, r *http.Request) { + // optional: assert method and headers + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + // Find matching test case by inspecting body or rely on sequential handling. 
+ // For simplicity, read body and decide response based on test inputs: + var req struct { + Messages []struct { + Content string `json:"content"` + } `json:"messages"` + } + _ = json.NewDecoder(r.Body).Decode(&req) + r.Body.Close() + + switch { + case strings.Contains(req.Messages[1].Content, "This is a test input for summarization."): + w.WriteHeader(200) + w.Write([]byte(`{"messages":[{"role":"assistant","content":"This is the summary."}]}`)) + case strings.Contains(req.Messages[1].Content, "Non-empty"): + w.WriteHeader(500) + w.Write([]byte("server error")) + default: + // default success + w.WriteHeader(200) + w.Write([]byte(`{"messages":[{"role":"assistant","content":"ok"}]}`)) + } + }) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - coh := &Cohere{client: cohereClient} + t.Setenv("WAYBACK_LLM_PROVIDER", "cohere") + t.Setenv("WAYBACK_LLM_APIKEY", "test-key") + + parser := config.NewParser() + opts, err := parser.ParseEnvironmentVariables() + if err != nil { + t.Fatalf("Parse environment variables or flags failed, error: %v", err) + } + + coh := NewCohere(httpClient, opts) - // Call the Summarize method actual, actualErr := coh.Summarize(tt.input) - // Check the results - if tt.expected != actual { - t.Fatalf(`unexpected summarize, got "%v" instead of "%v"`, actual, tt.expected) + if tt.expectedErr != "" { + if actualErr == nil { + t.Fatalf("expected error %q, got nil", tt.expectedErr) + } + if actualErr.Error() != tt.expectedErr { + t.Fatalf("unexpected error, got %q expected %q", actualErr.Error(), tt.expectedErr) + } + return + } + + if actualErr != nil { + t.Fatalf("unexpected error: %v", actualErr) } - if !reflect.DeepEqual(tt.expectedErr, actualErr) { - t.Fatalf(`unexpected summarize, got "%v" instead of "%v"`, actualErr, tt.expectedErr) + if actual != tt.expected { + t.Fatalf(`unexpected summary, got "%v" instead of "%v"`, actual, tt.expected) } }) } diff --git a/summary/openrouter.go b/summary/openrouter.go new file mode 100644 index 
00000000..d6cd87f1 --- /dev/null +++ b/summary/openrouter.go @@ -0,0 +1,95 @@ +// Copyright 2026 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. + +package summary // import "github.com/wabarc/wayback/summary" + +import ( + "bytes" + "encoding/json" + "fmt" + "net/http" + "strings" + + "github.com/wabarc/wayback/config" + "github.com/wabarc/wayback/ingress" +) + +// Interface guard +var _ Summarizer = (*OpenRouter)(nil) + +// OpenRouter represents a text summarization client for OpenRouter LLM service. +type OpenRouter struct { + client *http.Client + apiKey string + model string +} + +// NewOpenRouter creates a `OpenRouter` instance with the specified `http.Client` and options. +// If the `http.Client` instance is `nil`, the default client is used. This function returns a pointer +// to the newly created `OpenRouter` instance and an error, if any. +func NewOpenRouter(c *http.Client, opts *config.Options) *OpenRouter { + if c == nil { + c = ingress.Client() + } + model := opts.LLMModel() + if model == "" { + model = "openrouter/auto" + } + + return &OpenRouter{ + client: c, + apiKey: opts.LLMApiKey(), + model: model, + } +} + +// Summarize generates a summary of the input text using OpenRouter's AI models. +// Returns the generated summary as a string and an error, if any. 
+func (coh *OpenRouter) Summarize(s string) (string, error) { + s = strings.TrimSpace(s) + if s == "" { + return "", fmt.Errorf("text not found") + } + + body := chatRequest{ + Model: coh.model, + Messages: []chatMessage{ + {Role: "system", Content: systemPrompt}, + {Role: "user", Content: s}, + }, + } + buf, err := json.Marshal(body) + if err != nil { + return "", fmt.Errorf("failed to marshal json: %v", err) + } + + endpoint := "https://openrouter.ai/api/v1/chat/completions" + req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(buf)) + if err != nil { + return "", fmt.Errorf("failed to make request: %v", err) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+coh.apiKey) + + res, err := coh.client.Do(req) + if err != nil { + return "", err + } + defer res.Body.Close() + + if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices { + return "", fmt.Errorf("cohere api error: status %d", res.StatusCode) + } + + var cr chatResponse + if err := json.NewDecoder(res.Body).Decode(&cr); err != nil { + return "", fmt.Errorf("failed to decode body: %v", err) + } + + if len(cr.Choices) > 0 && strings.TrimSpace(cr.Choices[0].Message.Content) != "" { + return strings.TrimSpace(cr.Choices[0].Message.Content), nil + } + + return s, nil +} diff --git a/summary/openrouter_test.go b/summary/openrouter_test.go new file mode 100644 index 00000000..0b262735 --- /dev/null +++ b/summary/openrouter_test.go @@ -0,0 +1,183 @@ +// Copyright 2026 Wayback Archiver. All rights reserved. +// Use of this source code is governed by the GNU GPL v3 +// license that can be found in the LICENSE file. 
+ +package summary // import "github.com/wabarc/wayback/summary" + +import ( + "encoding/json" + "net/http" + "strings" + "testing" + + "github.com/wabarc/helper" + "github.com/wabarc/wayback/config" +) + +func TestNewOpenRouter(t *testing.T) { + httpClient, mux, server := helper.MockServer() + defer server.Close() + + handleFunc := func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/api/v1/chat/completions": + w.Write(summarizeResponse) + } + } + mux.HandleFunc("/", handleFunc) + + tests := []struct { + desc string + client *http.Client + key string + expectErr bool + expectNil bool + }{ + { + desc: "Valid inputs", + client: httpClient, + key: "valid_api_key", + expectErr: false, + expectNil: false, + }, + { + desc: "Invalid API key", + client: httpClient, + key: apiKey, + expectErr: true, + expectNil: true, + }, + { + desc: "Nil http.Client", + client: nil, + key: apiKey, + expectErr: false, + expectNil: false, + }, + } + + for _, tt := range tests { + t.Run(tt.desc, func(t *testing.T) { + t.Setenv("WAYBACK_LLM_PROVIDER", "cohere") + t.Setenv("WAYBACK_LLM_APIKEY", tt.key) + + parser := config.NewParser() + opts, err := parser.ParseEnvironmentVariables() + if err != nil { + t.Fatalf("Parse environment variables or flags failed, error: %v", err) + } + + cohere := NewOpenRouter(tt.client, opts) + if !tt.expectNil && cohere == nil { + t.Errorf("Unexpected nil value for OpenRouter instance") + } + }) + } +} + +func TestOpenRouterSummarize(t *testing.T) { + tests := []struct { + name string + input string + mockStatus int + mockBody string + expected string + expectedErr string + }{ + { + name: "Empty string", + input: "", + expected: "", + expectedErr: "text not found", + }, + { + name: "Valid input", + input: "This is a test input for summarization.", + mockStatus: 200, + mockBody: `{ + "messages":[ + {"role":"user","content":"This is the summary."} + ] + }`, + expected: "This is the 
summary.", + expectedErr: "", + }, + { + name: "API error status", + input: "Non-empty", + mockStatus: 500, + mockBody: `{"error":"server"}`, + expected: "", + expectedErr: "cohere api error: status 500", + }, + } + + httpClient, mux, server := helper.MockServer() + defer server.Close() + + // Register handler at expected endpoint path used by the client. + mux.HandleFunc("/api/v1/chat/completions", func(w http.ResponseWriter, r *http.Request) { + // optional: assert method and headers + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + // Find matching test case by inspecting body or rely on sequential handling. + // For simplicity, read body and decide response based on test inputs: + var req struct { + Messages []struct { + Content string `json:"content"` + } `json:"messages"` + } + _ = json.NewDecoder(r.Body).Decode(&req) + r.Body.Close() + + switch { + case strings.Contains(req.Messages[1].Content, "This is a test input for summarization."): + w.WriteHeader(200) + w.Write([]byte(`{"messages":[{"role":"assistant","content":"This is the summary."}]}`)) + case strings.Contains(req.Messages[1].Content, "Non-empty"): + w.WriteHeader(500) + w.Write([]byte("server error")) + default: + // default success + w.WriteHeader(200) + w.Write([]byte(`{"messages":[{"role":"assistant","content":"ok"}]}`)) + } + }) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv("WAYBACK_LLM_PROVIDER", "cohere") + t.Setenv("WAYBACK_LLM_APIKEY", "test-key") + + parser := config.NewParser() + opts, err := parser.ParseEnvironmentVariables() + if err != nil { + t.Fatalf("Parse environment variables or flags failed, error: %v", err) + } + + coh := NewOpenRouter(httpClient, opts) + + actual, actualErr := coh.Summarize(tt.input) + + if tt.expectedErr != "" { + if actualErr == nil { + t.Fatalf("expected error %q, got nil", tt.expectedErr) + } + if actualErr.Error() != tt.expectedErr { + t.Fatalf("unexpected 
error, got %q expected %q", actualErr.Error(), tt.expectedErr) + } + return + } + + if actualErr != nil { + t.Fatalf("unexpected error: %v", actualErr) + } + if actual != tt.expected { + t.Fatalf(`unexpected summary, got "%v" instead of "%v"`, actual, tt.expected) + } + }) + } +} diff --git a/summary/summary.go b/summary/summary.go index 552d6cf2..edf1da5d 100644 --- a/summary/summary.go +++ b/summary/summary.go @@ -5,8 +5,10 @@ package summary // import "github.com/wabarc/wayback/summary" import ( - "fmt" "strings" + + "github.com/wabarc/wayback/config" + "github.com/wabarc/wayback/ingress" ) // Summarizer is the interface that wraps the basic Summarize method. @@ -16,29 +18,23 @@ type Summarizer interface { Summarize(s string) (string, error) } -// Interface guard -var _ Summarizer = (*Summary)(nil) - // Summary provides a high-level interface for generating text summaries using // different summarization methods. type Summary struct { Handler interface{} } -// Summarize generates a summary of the input text using the selected summarization method. -// It returns the summary as a string and any error that occurred during summarization. -func (sum *Summary) Summarize(s string) (string, error) { - s = strings.TrimSpace(s) - if s == "" { - return "", fmt.Errorf("text not found") +// NewSummary creates and returns a Summarizer based on the configured LLM provider. +// It inspects opts.LLMProvider() (case-insensitive) and constructs a provider-specific +// handler. It falls back to the legacy summarizer implementation. +// The returned Summarizer wraps the chosen handler. 
+func NewSummary(opts *config.Options) Summarizer { + switch strings.ToLower(opts.LLMProvider()) { + case "cohere": + return NewCohere(ingress.Client(), opts) + case "openrouter": + return NewOpenRouter(ingress.Client(), opts) } - switch handler := sum.Handler.(type) { - case *Cohere: - return handler.Summarize(s) - case *Legacy: - return handler.Summarize(s) - default: - return "", fmt.Errorf("invalid handler") - } + return NewLegacy() } diff --git a/summary/summary_test.go b/summary/summary_test.go index 19566be4..0db62241 100644 --- a/summary/summary_test.go +++ b/summary/summary_test.go @@ -7,8 +7,8 @@ package summary // import "github.com/wabarc/wayback/summary" import ( "testing" - "github.com/cohere-ai/cohere-go" "github.com/wabarc/helper" + "github.com/wabarc/wayback/config" ) func TestSummarize(t *testing.T) { @@ -17,39 +17,40 @@ func TestSummarize(t *testing.T) { mux.HandleFunc("/", handleFunc) - cohereClient := &cohere.Client{Client: *httpClient, BaseURL: server.URL + "/"} - coh := &Cohere{client: cohereClient} + t.Setenv("WAYBACK_LLM_PROVIDER", "cohere") + t.Setenv("WAYBACK_LLM_APIKEY", "test-key") + + parser := config.NewParser() + opts, err := parser.ParseEnvironmentVariables() + if err != nil { + t.Fatalf("Parse environment variables or flags failed, error: %v", err) + } + + coh := NewCohere(httpClient, opts) tests := []struct { name string - handler interface{} + handler Summarizer input string wantErr bool errMessage string }{ { - name: "valid Cohere handler", + name: "Valid Cohere handler", handler: coh, input: "This is a test string.", wantErr: false, errMessage: "", }, { - name: "valid Locally handler", + name: "Valid Locally handler", handler: NewLegacy(), input: "This is a test string.", wantErr: false, errMessage: "", }, { - name: "invalid handler", - handler: "invalid-handler", - input: "This is a test string.", - wantErr: true, - errMessage: "invalid handler", - }, - { - name: "empty input", + name: "Empty input", handler: coh, input: "", 
wantErr: true, @@ -59,9 +60,7 @@ func TestSummarize(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - sum := &Summary{Handler: tt.handler} - - _, err := sum.Summarize(tt.input) + _, err := tt.handler.Summarize(tt.input) if (err != nil) != tt.wantErr { t.Fatalf(`Unexpected error status. Got "%v", but wanted error="%v"`, err, tt.wantErr) diff --git a/wayback.1 b/wayback.1 index d89cd1aa..9d6dcc70 100644 --- a/wayback.1 +++ b/wayback.1 @@ -224,8 +224,16 @@ Directory to store binary file, e.g. PDF, html file\&. .B WAYBACK_MAX_MEDIA_SIZE Max size to limit download stream media. default 512MB\&. .TP -.B WAYBACK_COHERE_APIKEY -Cohere API key\&. +.B WAYBACK_LLM_PROVIDER +Enables AI-enhanced summary. Provider options: cohere | openrouter\&. +.TP +.B WAYBACK_LLM_APIKEY +LLM API key\&. +.TP +.B WAYBACK_LLM_MODEL +LLM model. Each provider has a sensible default: +.br +cohere: command-a-03-2025 | openrouter: openrouter/auto\&. .TP .B WAYBACK_MEDIA_SITES Extra media websites wish to be supported, separate with comma\&. 
diff --git a/wayback.conf b/wayback.conf index b227e50c..291c278d 100644 --- a/wayback.conf +++ b/wayback.conf @@ -76,7 +76,9 @@ WAYBACK_USERAGENT=WaybackArchiver/1.0 WAYBACK_FALLBACK=off WAYBACK_PROXY= WAYBACK_PRIVACY_URL= -WAYBACK_COHERE_APIKEY= +WAYBACK_LLM_PROVIDER= +WAYBACK_LLM_APIKEY= +WAYBACK_LLM_MODEL= # ipfs slot: infura, pinata # doc: https://github.com/wabarc/ipfs-pinner#supported-pinning-services From c841bb70caa43ee765bd71c80e1f216fd8cf7f36 Mon Sep 17 00:00:00 2001 From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com> Date: Sun, 26 Apr 2026 14:56:47 +0000 Subject: [PATCH 6/8] Make linter happy --- summary/chat.go | 8 ++++---- summary/legacy.go | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/summary/chat.go b/summary/chat.go index 2f913479..35383ec4 100644 --- a/summary/chat.go +++ b/summary/chat.go @@ -4,7 +4,7 @@ package summary // import "github.com/wabarc/wayback/summary" -const systemPrompt = `You are a digital archivist and information synthesiser, your expertise lies in distilling "noise" from legacy web data into high-signal summaries. +const systemPrompt = `You are a digital archivist and information synthesizer, your expertise lies in distilling "noise" from legacy web data into high-signal summaries. 
Rules: @@ -22,8 +22,8 @@ type chatMessage struct { } type chatRequest struct { - Messages []chatMessage `json:"messages"` Model string `json:"model"` + Messages []chatMessage `json:"messages"` } type chatContent struct { @@ -32,13 +32,13 @@ type chatContent struct { } type chatChoice struct { - Contents []chatContent `json:"content"` Message chatMessage `json:"message,omitempty"` Role string `json:"role"` + Contents []chatContent `json:"content"` } type chatResponse struct { Message chatChoice `json:"message,omitempty"` - Choices []chatChoice `json:"choices,omitempty"` ID string `json:"id"` + Choices []chatChoice `json:"choices,omitempty"` } diff --git a/summary/legacy.go b/summary/legacy.go index b69fed9f..e25f70bf 100644 --- a/summary/legacy.go +++ b/summary/legacy.go @@ -35,7 +35,7 @@ func (l *Legacy) Summarize(s string) (string, error) { return "", fmt.Errorf("text not found") } - l.Bag.MaxCharacters = maxCharacters + l.MaxCharacters = maxCharacters res, err := l.Bag.Summarize(s, 1) if err != nil { return "", fmt.Errorf("summarize failed: %v", err) From a07631ef27304d414ce21e6f978dfab1e6728d98 Mon Sep 17 00:00:00 2001 From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com> Date: Sun, 3 May 2026 03:19:31 +0000 Subject: [PATCH 7/8] Minor changes --- summary/chat.go | 3 +-- summary/openrouter.go | 10 +++++----- summary/summary.go | 6 ------ 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/summary/chat.go b/summary/chat.go index 35383ec4..eb477ac6 100644 --- a/summary/chat.go +++ b/summary/chat.go @@ -6,7 +6,6 @@ package summary // import "github.com/wabarc/wayback/summary" const systemPrompt = `You are a digital archivist and information synthesizer, your expertise lies in distilling "noise" from legacy web data into high-signal summaries. 
- Rules: - Summary point must be anchored by specific verbatim quotes - Ignore UI elements (navbars, footers) and focus on the core content @@ -14,7 +13,7 @@ Rules: - Summary must be in the same language as the source content - Do NOT repeat ideas from previous snapshots unless conditions have materially changed -The output should be a maximum of 280 plain text characters.` +The output should be a maximum of 280 plain paragraphs.` type chatMessage struct { Role string `json:"role"` diff --git a/summary/openrouter.go b/summary/openrouter.go index d6cd87f1..56a06c07 100644 --- a/summary/openrouter.go +++ b/summary/openrouter.go @@ -46,14 +46,14 @@ func NewOpenRouter(c *http.Client, opts *config.Options) *OpenRouter { // Summarize generates a summary of the input text using OpenRouter's AI models. // Returns the generated summary as a string and an error, if any. -func (coh *OpenRouter) Summarize(s string) (string, error) { +func (or *OpenRouter) Summarize(s string) (string, error) { s = strings.TrimSpace(s) if s == "" { return "", fmt.Errorf("text not found") } body := chatRequest{ - Model: coh.model, + Model: or.model, Messages: []chatMessage{ {Role: "system", Content: systemPrompt}, {Role: "user", Content: s}, @@ -70,16 +70,16 @@ func (coh *OpenRouter) Summarize(s string) (string, error) { return "", fmt.Errorf("failed to make request: %v", err) } req.Header.Set("Content-Type", "application/json") - req.Header.Set("Authorization", "Bearer "+coh.apiKey) + req.Header.Set("Authorization", "Bearer "+or.apiKey) - res, err := coh.client.Do(req) + res, err := or.client.Do(req) if err != nil { return "", err } defer res.Body.Close() if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices { - return "", fmt.Errorf("cohere api error: status %d", res.StatusCode) + return "", fmt.Errorf("openrouter api error: status %d", res.StatusCode) } var cr chatResponse diff --git a/summary/summary.go b/summary/summary.go index edf1da5d..90f4ac31 100644 --- 
a/summary/summary.go +++ b/summary/summary.go @@ -18,12 +18,6 @@ type Summarizer interface { Summarize(s string) (string, error) } -// Summary provides a high-level interface for generating text summaries using -// different summarization methods. -type Summary struct { - Handler interface{} -} - // NewSummary creates and returns a Summarizer based on the configured LLM provider. // It inspects opts.LLMProvider() (case-insensitive) and constructs a provider-specific // handler. It falls back to the legacy summarizer implementation. From a45498e6b38c6677b7ef15568206c2696cc1165b Mon Sep 17 00:00:00 2001 From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com> Date: Sun, 3 May 2026 14:55:55 +0000 Subject: [PATCH 8/8] Update prompt --- summary/chat.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/summary/chat.go b/summary/chat.go index eb477ac6..340be066 100644 --- a/summary/chat.go +++ b/summary/chat.go @@ -13,6 +13,12 @@ Rules: - Summary must be in the same language as the source content - Do NOT repeat ideas from previous snapshots unless conditions have materially changed +FORMATTING RULES (STRICT): +- STRICT PROHIBITION: Do not use Markdown bolding (**text**) +- Use ONLY plain text without any formatting +- Use simple line breaks to separate points +- Do NOT use headers or bold labels + The output should be a maximum of 280 plain paragraphs.` type chatMessage struct {