From eafb06271e0c5936ceef751d8cd98baa995a30ea Mon Sep 17 00:00:00 2001
From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com>
Date: Fri, 5 May 2023 09:37:10 +0100
Subject: [PATCH 1/8] Add support for AI-powered summarization
---
config/options.go | 8 +++
config/parser.go | 2 +
docs/environment.md | 1 +
go.mod | 4 +-
go.sum | 7 +-
reduxer/reduxer.go | 16 ++++-
summary/cohere.go | 54 +++++++++++++++
summary/cohere_test.go | 134 ++++++++++++++++++++++++++++++++++++
summary/doc.go | 9 +++
summary/summary.go | 12 ++++
template/render/discord.go | 2 +-
template/render/github.go | 2 +-
template/render/matrix.go | 2 +-
template/render/render.go | 40 ++++++++++-
template/render/slack.go | 2 +-
template/render/telegram.go | 2 +-
wayback.1 | 3 +
wayback.conf | 1 +
18 files changed, 291 insertions(+), 10 deletions(-)
create mode 100644 summary/cohere.go
create mode 100644 summary/cohere_test.go
create mode 100644 summary/doc.go
create mode 100644 summary/summary.go
diff --git a/config/options.go b/config/options.go
index 959877bf..ece12f82 100644
--- a/config/options.go
+++ b/config/options.go
@@ -93,6 +93,7 @@ const (
defBoltPathname = "wayback.db"
defPoolingSize = 3
defMaxMediaSize = "512MB"
+ defCohereApiKey = ""
defWaybackTimeout = 300
defWaybackMaxRetries = 2
defWaybackUserAgent = "WaybackArchiver/1.0"
@@ -156,6 +157,7 @@ type Options struct {
boltPathname string
maxMediaSize string
poolingSize int
+ cohereApiKey string
waybackTimeout int
waybackMaxRetries int
enabledChromeRemote bool
@@ -289,6 +291,7 @@ func NewOptions() *Options {
storageDir: defStorageDir,
maxMediaSize: defMaxMediaSize,
privacyURL: defPrivacyURL,
+ cohereApiKey: defCohereApiKey,
waybackTimeout: defWaybackTimeout,
waybackMaxRetries: defWaybackMaxRetries,
waybackUserAgent: defWaybackUserAgent,
@@ -951,6 +954,11 @@ func (o *Options) MaxMediaSize() uint64 {
return size
}
+// CohereApiKey returns the configured Cohere API key.
+func (o *Options) CohereApiKey() string {
+ return o.cohereApiKey
+}
+
// MaxAttachSize returns max attach size limits for several services.
// scope: telegram
func (o *Options) MaxAttachSize(scope string) int64 {
diff --git a/config/parser.go b/config/parser.go
index 76d233a0..00a00402 100644
--- a/config/parser.go
+++ b/config/parser.go
@@ -223,6 +223,8 @@ func (p *Parser) parseLines(lines []string) (err error) {
p.opts.storageDir = parseString(val, defStorageDir)
case "WAYBACK_MAX_MEDIA_SIZE":
p.opts.maxMediaSize = parseString(val, defMaxMediaSize)
+ case "WAYBACK_COHERE_APIKEY":
+ p.opts.cohereApiKey = parseString(val, defCohereApiKey)
case "WAYBACK_TIMEOUT":
p.opts.waybackTimeout = parseInt(val, defWaybackTimeout)
case "WAYBACK_MAX_RETRIES":
diff --git a/docs/environment.md b/docs/environment.md
index b7265e05..a2a15de1 100644
--- a/docs/environment.md
+++ b/docs/environment.md
@@ -32,6 +32,7 @@ Use the `-c` / `--config` option to specify the build definition file to use.
| - | `WAYBACK_BOLT_PATH` | `./wayback.db` | File path of bolt database |
| - | `WAYBACK_STORAGE_DIR` | - | Directory to store binary file, e.g. PDF, html file |
| - | `WAYBACK_MAX_MEDIA_SIZE` | `512MB` | Max size to limit download stream media |
+| - | `WAYBACK_COHERE_APIKEY` | - | Cohere API key |
| - | `WAYBACK_MEDIA_SITES` | - | Extra media websites wish to be supported, separate with comma |
| - | `WAYBACK_TIMEOUT` | `300` | Timeout for single wayback request, defaults to 300 second |
| - | `WAYBACK_MAX_RETRIES` | `2` | Max retries for single wayback request, defaults to 2 |
diff --git a/go.mod b/go.mod
index 5ffdd307..c19d7d3a 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,7 @@ go 1.24.0
require (
github.com/PuerkitoBio/goquery v1.9.0
github.com/bwmarrin/discordgo v0.28.1
+ github.com/cohere-ai/cohere-go v0.2.0
github.com/cretz/bine v0.2.0
github.com/davecgh/go-spew v1.1.1
github.com/dghubble/go-twitter v0.0.0-20201011215211-4b180d0cc78d
@@ -82,11 +83,12 @@ require (
github.com/chromedp/chromedp v0.9.5 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/cloudflare/circl v1.3.7 // indirect
+ github.com/cohere-ai/tokenizer v1.1.2 // indirect
github.com/crackcomm/go-gitignore v0.0.0-20170627025303-887ab5e44cc3 // indirect
github.com/decred/dcrd/crypto/blake256 v1.0.1 // indirect
github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 // indirect
github.com/dghubble/sling v1.3.0 // indirect
- github.com/dlclark/regexp2 v1.7.0 // indirect
+ github.com/dlclark/regexp2 v1.9.0 // indirect
github.com/dop251/goja v0.0.0-20221115122301-6c0d9883792e // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/fortytw2/leaktest v1.3.0 // indirect
diff --git a/go.sum b/go.sum
index f3e8375c..eaca4e93 100644
--- a/go.sum
+++ b/go.sum
@@ -84,6 +84,10 @@ github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMn
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vcU=
github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA=
+github.com/cohere-ai/cohere-go v0.2.0 h1:Gljkn8LTtsAPy79ks1AVmZH9Av4kuQuXEgzEJ/1Ea34=
+github.com/cohere-ai/cohere-go v0.2.0/go.mod h1:DFcCu5rwro4wAlluIXY9l17NLGiVBGb2bRio46RXBm8=
+github.com/cohere-ai/tokenizer v1.1.2 h1:t3KwUBSpKiBVFtpnHBfVIQNmjfZUuqFVYuSFkZYOWpU=
+github.com/cohere-ai/tokenizer v1.1.2/go.mod h1:9MNFPd9j1fuiEK3ua2HSCUxxcrfGMlSqpa93livg/C0=
github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk=
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
@@ -119,8 +123,9 @@ github.com/dghubble/sling v1.3.0/go.mod h1:XXShWaBWKzNLhu2OxikSNFrlsvowtz4kyRuXU
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
github.com/dlclark/regexp2 v1.4.1-0.20201116162257-a2a8dda75c91/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc=
-github.com/dlclark/regexp2 v1.7.0 h1:7lJfhqlPssTb1WQx4yvTHN0uElPEv52sbaECrAQxjAo=
github.com/dlclark/regexp2 v1.7.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
+github.com/dlclark/regexp2 v1.9.0 h1:pTK/l/3qYIKaRXuHnEnIf7Y5NxfRPfpb7dis6/gdlVI=
+github.com/dlclark/regexp2 v1.9.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
github.com/dop251/goja v0.0.0-20211022113120-dc8c55024d06/go.mod h1:R9ET47fwRVRPZnOGvHxxhuZcbrMCuiqOz3Rlrh4KSnk=
github.com/dop251/goja v0.0.0-20221115122301-6c0d9883792e h1:Uo51nR73BJlci20AE5tXT5qiLSGZy5LHnRlKt7VkcUM=
github.com/dop251/goja v0.0.0-20221115122301-6c0d9883792e/go.mod h1:yRkwfj0CBpOGre+TwBsqPV0IH0Pk73e4PXJOeNDboGs=
diff --git a/reduxer/reduxer.go b/reduxer/reduxer.go
index 14cb6e09..b1f55634 100644
--- a/reduxer/reduxer.go
+++ b/reduxer/reduxer.go
@@ -27,6 +27,7 @@ import (
"github.com/wabarc/wayback/config"
"github.com/wabarc/wayback/errors"
"github.com/wabarc/wayback/ingress"
+ "github.com/wabarc/wayback/summary"
"golang.org/x/sync/errgroup"
)
@@ -57,6 +58,7 @@ type bundle struct {
shots *screenshot.Screenshots[screenshot.Path]
artifact Artifact
article readability.Article
+ summary string
}
// Artifact represents the file paths stored on the local disk.
@@ -135,6 +137,11 @@ func (b *bundle) Article() readability.Article {
return b.article
}
+// Summary returns the summary generated for the article.
+func (b *bundle) Summary() string {
+ return b.summary
+}
+
// Do executes secreenshot, print PDF and export html of given URLs
// Returns a set of bundle containing screenshot data and file path
// nolint:gocyclo
@@ -221,11 +228,18 @@ func Do(ctx context.Context, opts *config.Options, urls ...*url.URL) (Reduxer, e
if err = os.WriteFile(fp, helper.String2Byte(article.TextContent), filePerm); err == nil && article.TextContent != "" {
artifact.Txt.Local = fp
}
+
+ // Generate summary
+ sum := ""
+ if coh, err := summary.NewCohere(ingress.Client(), opts.CohereApiKey()); err == nil {
+ sum, _ = coh.Summarize(article.TextContent) // nolint:errcheck
+ }
+
// Upload files to third-party server
if err = remotely(ctx, artifact); err != nil {
logger.Error("upload files to remote server failed: %v", err)
}
- bundle := &bundle{shots: shot, artifact: *artifact, article: article}
+ bundle := &bundle{shots: shot, artifact: *artifact, article: article, summary: sum}
bs.Store(Src(shot.URL), bundle)
return nil
})
diff --git a/summary/cohere.go b/summary/cohere.go
new file mode 100644
index 00000000..fb9bcf4d
--- /dev/null
+++ b/summary/cohere.go
@@ -0,0 +1,54 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+ "fmt"
+ "net/http"
+ "strings"
+
+ "github.com/cohere-ai/cohere-go"
+)
+
+// Interface guard
+var _ Summarizer = (*Cohere)(nil)
+
+// Cohere represents a text summarization algorithm powered by Cohere's AI models.
+type Cohere struct {
+ client *cohere.Client
+}
+
+// NewCohere creates a `Cohere` instance with the specified `http.Client` instance and API key.
+// If the `http.Client` instance is `nil`, the default client is used. This function returns a pointer
+// to the newly created `Cohere` instance and an error, if any.
+func NewCohere(c *http.Client, key string) (*Cohere, error) {
+ coh, err := cohere.CreateClient(key)
+ if err != nil {
+ return nil, err
+ }
+ if c != nil {
+ coh.Client = *c
+ }
+
+ return &Cohere{coh}, nil
+}
+
+// Summarize generates a summary of the input text using Cohere's AI models.
+// Returns the generated summary as a string and an error, if any.
+func (coh *Cohere) Summarize(s string) (string, error) {
+ s = strings.TrimSpace(s)
+ if s == "" {
+ return "", fmt.Errorf("text not found")
+ }
+
+ res, err := coh.client.Summarize(cohere.SummarizeOptions{
+ Text: s,
+ })
+ if err != nil {
+ return "", err
+ }
+
+ return res.Summary, nil
+}
diff --git a/summary/cohere_test.go b/summary/cohere_test.go
new file mode 100644
index 00000000..de22e341
--- /dev/null
+++ b/summary/cohere_test.go
@@ -0,0 +1,134 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+ "fmt"
+ "net/http"
+ "os"
+ "reflect"
+ "testing"
+
+ "github.com/cohere-ai/cohere-go"
+ "github.com/wabarc/helper"
+)
+
+var (
+ apiKey = os.Getenv("COHERE_APIKEY")
+ summarized = "This is a summary of the test input."
+ summarizeResponse = []byte(fmt.Sprintf(`{
+ "summary": "%s"
+}`, summarized))
+
+ handleFunc = func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "application/json")
+ switch r.URL.Path {
+ case "/summarize":
+ w.Write(summarizeResponse)
+ }
+ }
+)
+
+func TestNewCohere(t *testing.T) {
+ if apiKey == "" {
+ t.Skip(`Must set env "COHERE_APIKEY"`)
+ }
+
+ httpClient, mux, server := helper.MockServer()
+ defer server.Close()
+
+ mux.HandleFunc("/", handleFunc)
+
+ tests := []struct {
+ desc string
+ client *http.Client
+ key string
+ expectErr bool
+ expectNil bool
+ }{
+ {
+ desc: "Valid inputs",
+ client: httpClient,
+ key: "valid_api_key",
+ expectErr: false,
+ expectNil: false,
+ },
+ {
+ desc: "Invalid API key",
+ client: httpClient,
+ key: apiKey,
+ expectErr: true,
+ expectNil: true,
+ },
+ {
+ desc: "Nil http.Client",
+ client: nil,
+ key: apiKey,
+ expectErr: false,
+ expectNil: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.desc, func(t *testing.T) {
+ cohere, err := NewCohere(tt.client, tt.key)
+ if tt.expectErr && err == nil {
+ t.Errorf("Expected error but got nil")
+ }
+ if tt.expectNil && cohere != nil {
+ t.Errorf("Expected nil value for Cohere instance")
+ }
+ if !tt.expectNil && cohere == nil {
+ t.Errorf("Unexpected nil value for Cohere instance")
+ }
+ })
+ }
+}
+
+func TestCohere_Summarize(t *testing.T) {
+ tests := []struct {
+ name string
+ input string
+ expected string
+ expectedErr error
+ }{
+ {
+ name: "Empty string",
+ input: "",
+ expected: "",
+ expectedErr: fmt.Errorf("text not found"),
+ },
+ {
+ name: "Valid input",
+ input: "This is a test input for summarization.",
+ expected: summarized,
+ expectedErr: nil,
+ },
+ }
+
+ httpClient, mux, server := helper.MockServer()
+ defer server.Close()
+
+ mux.HandleFunc("/", handleFunc)
+
+ cohereClient := &cohere.Client{Client: *httpClient, BaseURL: server.URL + "/"}
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ coh := &Cohere{client: cohereClient}
+
+ // Call the Summarize method
+ actual, actualErr := coh.Summarize(tt.input)
+
+ // Check the results
+ if tt.expected != actual {
+ t.Fatalf(`unexpected summarize, got "%v" instead of "%v"`, actual, tt.expected)
+ }
+ if !reflect.DeepEqual(tt.expectedErr, actualErr) {
+ t.Fatalf(`unexpected summarize, got "%v" instead of "%v"`, actualErr, tt.expectedErr)
+ }
+ })
+ }
+}
diff --git a/summary/doc.go b/summary/doc.go
new file mode 100644
index 00000000..77056c34
--- /dev/null
+++ b/summary/doc.go
@@ -0,0 +1,9 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+/*
+Package summary is designed to provide a comprehensive set of tools for
+automated text summarization.
+*/
+package summary // import "github.com/wabarc/wayback/summary"
diff --git a/summary/summary.go b/summary/summary.go
new file mode 100644
index 00000000..3523b01f
--- /dev/null
+++ b/summary/summary.go
@@ -0,0 +1,12 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+// Summarizer is the interface that wraps the basic Summarize method.
+//
+// Summarize takes in a string of text and returns a summary.
+type Summarizer interface {
+ Summarize(s string) (string, error)
+}
diff --git a/template/render/discord.go b/template/render/discord.go
index 809ba788..b4ecfb54 100644
--- a/template/render/discord.go
+++ b/template/render/discord.go
@@ -59,7 +59,7 @@ func (d *Discord) ForPublish() (r *Render) {
tmplBytes.WriteString("\n\n")
}
- if dgst := Digest(d.Cols, d.Data); dgst != "" {
+ if dgst := summaryOrDigest(d.Cols, d.Data); dgst != "" {
tmplBytes.WriteString(dgst)
tmplBytes.WriteString("\n\n")
}
diff --git a/template/render/github.go b/template/render/github.go
index 81988f3a..28834bd9 100644
--- a/template/render/github.go
+++ b/template/render/github.go
@@ -33,7 +33,7 @@ func (gh *GitHub) ForReply() *Render {
func (gh *GitHub) ForPublish() *Render {
var tmplBytes bytes.Buffer
- if dgst := Digest(gh.Cols, gh.Data); dgst != "" {
+ if dgst := summaryOrDigest(gh.Cols, gh.Data); dgst != "" {
tmplBytes.WriteString(dgst)
tmplBytes.WriteString("\n\n")
}
diff --git a/template/render/matrix.go b/template/render/matrix.go
index a810408f..511c8ce7 100644
--- a/template/render/matrix.go
+++ b/template/render/matrix.go
@@ -66,7 +66,7 @@ func (m *Matrix) ForPublish() *Render {
tmplBytes.WriteString(` ›
`)
}
- if dgst := Digest(m.Cols, m.Data); dgst != "" {
+ if dgst := summaryOrDigest(m.Cols, m.Data); dgst != "" {
tmplBytes.WriteString(dgst)
tmplBytes.WriteString(`
`)
}
diff --git a/template/render/render.go b/template/render/render.go
index a45c9c85..71192217 100644
--- a/template/render/render.go
+++ b/template/render/render.go
@@ -157,8 +157,8 @@ func Title(cols []wayback.Collect, rdx reduxer.Reduxer) (title string) {
return
}
-// Digest returns digest of the webpage content. Its maximum length is defined by `maxDigestLen`.
-func Digest(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) {
+// digest returns digest of the webpage content. Its maximum length is defined by `maxDigestLen`.
+func digest(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) {
if rdx == nil {
return
}
@@ -185,6 +185,42 @@ func Digest(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) {
return
}
+// summary returns the generated summary of the webpage content. Each page's summary is truncated to `maxDigestLen` runes.
+func summary(cols []wayback.Collect, rdx reduxer.Reduxer) (dgst string) {
+ if rdx == nil {
+ return
+ }
+
+ for uri := range deDepURI(cols) {
+ if bundle, ok := rdx.Load(reduxer.Src(uri)); ok {
+ if text := bundle.Summary(); text != "" {
+ logger.Debug("extracted summary from article content: %s", text)
+ t := []rune(text)
+ l := len(t)
+ switch {
+ case l == 0:
+ continue
+ case l > maxDigestLen:
+ t = t[:maxDigestLen]
+ dgst += string(t) + ` ...`
+ default:
+ dgst += string(t)
+ }
+ }
+ }
+ }
+
+ return
+}
+
+func summaryOrDigest(cols []wayback.Collect, rdx reduxer.Reduxer) string {
+ if sum := summary(cols, rdx); sum != "" {
+ return sum
+ }
+
+ return digest(cols, rdx)
+}
+
// writeArtifact writes archived artifact of the webpage.
func writeArtifact(cols []wayback.Collect, rdx reduxer.Reduxer, fn func(art reduxer.Artifact)) {
if rdx == nil {
diff --git a/template/render/slack.go b/template/render/slack.go
index 880c56d4..4027fb08 100644
--- a/template/render/slack.go
+++ b/template/render/slack.go
@@ -61,7 +61,7 @@ func (s *Slack) ForPublish() (r *Render) {
tmplBytes.WriteString(" ›\n\n")
}
- if dgst := Digest(s.Cols, s.Data); dgst != "" {
+ if dgst := summaryOrDigest(s.Cols, s.Data); dgst != "" {
tmplBytes.WriteString(dgst)
tmplBytes.WriteString("\n\n")
}
diff --git a/template/render/telegram.go b/template/render/telegram.go
index c64bb30f..b7c94fea 100644
--- a/template/render/telegram.go
+++ b/template/render/telegram.go
@@ -69,7 +69,7 @@ func (t *Telegram) ForPublish() (r *Render) {
tmplBytes.WriteString("\n\n")
}
- if dgst := Digest(t.Cols, t.Data); dgst != "" {
+ if dgst := summaryOrDigest(t.Cols, t.Data); dgst != "" {
tmplBytes.WriteString(dgst)
tmplBytes.WriteString("\n\n")
}
diff --git a/wayback.1 b/wayback.1
index ae044242..d89cd1aa 100644
--- a/wayback.1
+++ b/wayback.1
@@ -224,6 +224,9 @@ Directory to store binary file, e.g. PDF, html file\&.
.B WAYBACK_MAX_MEDIA_SIZE
Max size to limit download stream media. default 512MB\&.
.TP
+.B WAYBACK_COHERE_APIKEY
+Cohere API key\&.
+.TP
.B WAYBACK_MEDIA_SITES
Extra media websites wish to be supported, separate with comma\&.
.TP
diff --git a/wayback.conf b/wayback.conf
index 57a2bdd4..b227e50c 100644
--- a/wayback.conf
+++ b/wayback.conf
@@ -76,6 +76,7 @@ WAYBACK_USERAGENT=WaybackArchiver/1.0
WAYBACK_FALLBACK=off
WAYBACK_PROXY=
WAYBACK_PRIVACY_URL=
+WAYBACK_COHERE_APIKEY=
# ipfs slot: infura, pinata
# doc: https://github.com/wabarc/ipfs-pinner#supported-pinning-services
From b996400700f352c4cbbaa1dbb3658acd35a1bb18 Mon Sep 17 00:00:00 2001
From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com>
Date: Tue, 9 May 2023 16:33:00 +0100
Subject: [PATCH 2/8] Generating summaries using a local algorithm
---
go.mod | 2 ++
go.sum | 11 ++++++
summary/locally.go | 49 +++++++++++++++++++++++++++
summary/locally_test.go | 55 ++++++++++++++++++++++++++++++
summary/summary.go | 32 ++++++++++++++++++
summary/summary_test.go | 75 +++++++++++++++++++++++++++++++++++++++++
6 files changed, 224 insertions(+)
create mode 100644 summary/locally.go
create mode 100644 summary/locally_test.go
create mode 100644 summary/summary_test.go
diff --git a/go.mod b/go.mod
index c19d7d3a..ecb46dbd 100644
--- a/go.mod
+++ b/go.mod
@@ -5,6 +5,7 @@ module github.com/wabarc/wayback
go 1.24.0
require (
+ github.com/JesusIslam/tldr v0.6.0
github.com/PuerkitoBio/goquery v1.9.0
github.com/bwmarrin/discordgo v0.28.1
github.com/cohere-ai/cohere-go v0.2.0
@@ -67,6 +68,7 @@ require (
github.com/MercuryEngineering/CookieMonster v0.0.0-20180304172713-1584578b3403 // indirect
github.com/SaveTheRbtz/generic-sync-map-go v0.0.0-20230201052002-6c5833b989be // indirect
github.com/VividCortex/ewma v1.2.0 // indirect
+ github.com/alixaxel/pagerank v0.0.0-20160306110729-14bfb4c1d88c // indirect
github.com/andybalholm/brotli v1.1.0 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/benbjohnson/clock v1.3.5 // indirect
diff --git a/go.sum b/go.sum
index eaca4e93..eb2d5781 100644
--- a/go.sum
+++ b/go.sum
@@ -2,6 +2,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
+github.com/JesusIslam/tldr v0.6.0 h1:b5jc9m77g9vs9iREKSitBWhyC6YdemtqjAqiCJycwt0=
+github.com/JesusIslam/tldr v0.6.0/go.mod h1:qnHomoqHP4q5qvOPggMBAnq7PB1V0CGF3+Dr4pcos74=
github.com/MercuryEngineering/CookieMonster v0.0.0-20180304172713-1584578b3403 h1:EtZwYyLbkEcIt+B//6sujwRCnHuTEK3qiSypAX5aJeM=
github.com/MercuryEngineering/CookieMonster v0.0.0-20180304172713-1584578b3403/go.mod h1:mM6WvakkX2m+NgMiPCfFFjwfH4KzENC07zeGEqq9U7s=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
@@ -16,6 +18,8 @@ github.com/VividCortex/ewma v1.2.0/go.mod h1:nz4BbCtbLyFDeC9SUHbtcT5644juEuWfUAU
github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
+github.com/alixaxel/pagerank v0.0.0-20160306110729-14bfb4c1d88c h1:UUHM6/UM34ESICar/DWOhLt2rqYabsvfjmupiY9z+iE=
+github.com/alixaxel/pagerank v0.0.0-20160306110729-14bfb4c1d88c/go.mod h1:e7Vic/xXDZAQ8ftWoLnVrXseAAvt54SVYrcirjCKcX0=
github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
@@ -145,6 +149,8 @@ github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
+github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=
+github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
@@ -356,6 +362,7 @@ github.com/multiformats/go-varint v0.0.7/go.mod h1:r8PUYw/fD/SjBCiKOoDlGF6QawOEL
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/nbd-wtf/go-nostr v0.17.1-0.20230426111250-32ca737acf77 h1:D7BdjjOD0D8r7RwLmrOTOJKEZ56D9YhLCEETz2Xh0Vo=
github.com/nbd-wtf/go-nostr v0.17.1-0.20230426111250-32ca737acf77/go.mod h1:YCDHJtaFQE76d1ZkcUsTkz3dYNP+bldo5CIQwXPPcbk=
+github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78=
github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
github.com/oliamb/cutter v0.2.2 h1:Lfwkya0HHNU1YLnGv2hTkzHfasrSMkgv4Dn+5rmlk3k=
@@ -363,11 +370,14 @@ github.com/oliamb/cutter v0.2.2/go.mod h1:4BenG2/4GuRBDbVm/OPahDVqbrOemzpPiG5mi1
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk=
+github.com/onsi/ginkgo v1.14.0 h1:2mOpI4JVVPBN+WQRa0WKH2eXR+Ey+uK4n7Zj0aYpIQA=
github.com/onsi/ginkgo v1.14.0/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY=
github.com/onsi/gomega v1.4.1/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA=
github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY=
github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo=
+github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
+github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+qQlhg=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
@@ -687,6 +697,7 @@ gopkg.in/sourcemap.v1 v1.0.5 h1:inv58fC9f9J3TK2Y2R1NPntXEn3/wjWHkonhIUODNTI=
gopkg.in/sourcemap.v1 v1.0.5/go.mod h1:2RlvNNSMglmRrcvhfuzp4hQHwOtjxlbjX7UPY/GXb78=
gopkg.in/telebot.v3 v3.0.0-20220130115853-f0291132d3c3 h1:ifpOmJCnVni31dBAw99qxgCRfD33ROgv7vYxuhu+iWc=
gopkg.in/telebot.v3 v3.0.0-20220130115853-f0291132d3c3/go.mod h1:7rExV8/0mDDNu9epSrDm/8j22KLaActH1Tbee6YjzWg=
+gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/summary/locally.go b/summary/locally.go
new file mode 100644
index 00000000..4e48e1a8
--- /dev/null
+++ b/summary/locally.go
@@ -0,0 +1,49 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+ "fmt"
+ "strings"
+
+ "github.com/JesusIslam/tldr"
+)
+
+const maxCharacters = 128
+
+// Interface guard
+var _ Summarizer = (*Locally)(nil)
+
+// Locally implements the Summarizer interface using the tldr.Bag package to
+// perform local summarization.
+type Locally struct {
+ *tldr.Bag
+}
+
+// NewLocally creates a new instance of the Locally struct with a new tldr.Bag instance.
+func NewLocally() *Locally {
+ return &Locally{tldr.New()}
+}
+
+// Summarize generates a summary of the input text using local summarization.
+// It returns the summary as a string and any error that occurred during summarization.
+func (l *Locally) Summarize(s string) (string, error) {
+ s = strings.TrimSpace(s)
+ if s == "" {
+ return "", fmt.Errorf("text not found")
+ }
+
+ l.Bag.MaxCharacters = maxCharacters
+ res, err := l.Bag.Summarize(s, 1)
+ if err != nil {
+ return "", fmt.Errorf("summarize failed: %v", err)
+ }
+
+ if len(res) == 0 {
+ return s, nil
+ }
+
+ return res[0], nil
+}
diff --git a/summary/locally_test.go b/summary/locally_test.go
new file mode 100644
index 00000000..da8f1d62
--- /dev/null
+++ b/summary/locally_test.go
@@ -0,0 +1,55 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+ "testing"
+)
+
+func TestLocally(t *testing.T) {
+ // Define test cases as a slice of structs.
+ tests := []struct {
+ name string
+ input string
+ want string
+ wantErr bool
+ errMessage string
+ }{
+ {
+ name: "valid input",
+ input: "This is a test string.",
+ want: "This is a test string.",
+ wantErr: false,
+ errMessage: "",
+ },
+ {
+ name: "empty input",
+ input: "",
+ want: "",
+ wantErr: true,
+ errMessage: "text not found",
+ },
+ }
+
+ local := NewLocally()
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got, err := local.Summarize(tt.input)
+
+ if (err != nil) != tt.wantErr {
+ t.Fatalf(`Unexpected error status. Got "%v", but wanted error="%v"`, err, tt.wantErr)
+ }
+
+ if tt.wantErr && err.Error() != tt.errMessage {
+ t.Fatalf(`Unexpected error message. Got "%v", but wanted "%v"`, err.Error(), tt.errMessage)
+ }
+
+ if !tt.wantErr && got != tt.want {
+ t.Fatalf(`Unexpected summary. Got "%v", but wanted "%v"`, got, tt.want)
+ }
+ })
+ }
+}
diff --git a/summary/summary.go b/summary/summary.go
index 3523b01f..87b3b5b3 100644
--- a/summary/summary.go
+++ b/summary/summary.go
@@ -4,9 +4,41 @@
package summary // import "github.com/wabarc/wayback/summary"
+import (
+ "fmt"
+ "strings"
+)
+
// Summarizer is the interface that wraps the basic Summarize method.
//
// Summarize takes in a string of text and returns a summary.
type Summarizer interface {
Summarize(s string) (string, error)
}
+
+// Interface guard
+var _ Summarizer = (*Summary)(nil)
+
+// Summary provides a high-level interface for generating text summaries using
+// different summarization methods.
+type Summary struct {
+ Handler interface{}
+}
+
+// Summarize generates a summary of the input text using the selected summarization method.
+// It returns the summary as a string and any error that occurred during summarization.
+func (sum *Summary) Summarize(s string) (string, error) {
+ s = strings.TrimSpace(s)
+ if s == "" {
+ return "", fmt.Errorf("text not found")
+ }
+
+ switch handler := sum.Handler.(type) {
+ case *Cohere:
+ return handler.Summarize(s)
+ case *Locally:
+ return handler.Summarize(s)
+ default:
+ return "", fmt.Errorf("invalid handler")
+ }
+}
diff --git a/summary/summary_test.go b/summary/summary_test.go
new file mode 100644
index 00000000..00d72c88
--- /dev/null
+++ b/summary/summary_test.go
@@ -0,0 +1,75 @@
+// Copyright 2023 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+ "testing"
+
+ "github.com/cohere-ai/cohere-go"
+ "github.com/wabarc/helper"
+)
+
+func TestSummarize(t *testing.T) {
+ httpClient, mux, server := helper.MockServer()
+ defer server.Close()
+
+ mux.HandleFunc("/", handleFunc)
+
+ cohereClient := &cohere.Client{Client: *httpClient, BaseURL: server.URL + "/"}
+ coh := &Cohere{client: cohereClient}
+
+ tests := []struct {
+ name string
+ handler interface{}
+ input string
+ wantErr bool
+ errMessage string
+ }{
+ {
+ name: "valid Cohere handler",
+ handler: coh,
+ input: "This is a test string.",
+ wantErr: false,
+ errMessage: "",
+ },
+ {
+ name: "valid Locally handler",
+ handler: NewLocally(),
+ input: "This is a test string.",
+ wantErr: false,
+ errMessage: "",
+ },
+ {
+ name: "invalid handler",
+ handler: "invalid-handler",
+ input: "This is a test string.",
+ wantErr: true,
+ errMessage: "invalid handler",
+ },
+ {
+ name: "empty input",
+ handler: coh,
+ input: "",
+ wantErr: true,
+ errMessage: "text not found",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ sum := &Summary{Handler: tt.handler}
+
+ _, err := sum.Summarize(tt.input)
+
+ if (err != nil) != tt.wantErr {
+ t.Fatalf(`Unexpected error status. Got "%v", but wanted error="%v"`, err, tt.wantErr)
+ }
+
+ if tt.wantErr && err.Error() != tt.errMessage {
+ t.Fatalf(`Unexpected error message. Got "%v", but wanted "%v"`, err.Error(), tt.errMessage)
+ }
+ })
+ }
+}
From 55986d6aa1b75d54c249592931c8560aa1e5b6d9 Mon Sep 17 00:00:00 2001
From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com>
Date: Tue, 9 May 2023 16:41:01 +0100
Subject: [PATCH 3/8] Locally-based summarization is the default
---
reduxer/reduxer.go | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/reduxer/reduxer.go b/reduxer/reduxer.go
index b1f55634..095fb103 100644
--- a/reduxer/reduxer.go
+++ b/reduxer/reduxer.go
@@ -230,10 +230,11 @@ func Do(ctx context.Context, opts *config.Options, urls ...*url.URL) (Reduxer, e
}
// Generate summary
- sum := ""
+ tldr := &summary.Summary{Handler: summary.NewLocally()}
if coh, err := summary.NewCohere(ingress.Client(), opts.CohereApiKey()); err == nil {
- sum, _ = coh.Summarize(article.TextContent) // nolint:errcheck
+ tldr = &summary.Summary{Handler: coh}
}
+ sum, _ := tldr.Summarize(article.TextContent) // nolint:errcheck
// Upload files to third-party server
if err = remotely(ctx, artifact); err != nil {
From 33186b0f2e974fc18b36acb90951b8d8b947a515 Mon Sep 17 00:00:00 2001
From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com>
Date: Sun, 26 Apr 2026 04:06:02 +0000
Subject: [PATCH 4/8] Rename locally to legacy
---
reduxer/reduxer.go | 2 +-
summary/{locally.go => legacy.go} | 18 +++++++++---------
summary/{locally_test.go => legacy_test.go} | 4 ++--
summary/summary.go | 2 +-
summary/summary_test.go | 2 +-
5 files changed, 14 insertions(+), 14 deletions(-)
rename summary/{locally.go => legacy.go} (62%)
rename summary/{locally_test.go => legacy_test.go} (95%)
diff --git a/reduxer/reduxer.go b/reduxer/reduxer.go
index 095fb103..31f297dc 100644
--- a/reduxer/reduxer.go
+++ b/reduxer/reduxer.go
@@ -230,7 +230,7 @@ func Do(ctx context.Context, opts *config.Options, urls ...*url.URL) (Reduxer, e
}
// Generate summary
- tldr := &summary.Summary{Handler: summary.NewLocally()}
+ tldr := &summary.Summary{Handler: summary.NewLegacy()}
if coh, err := summary.NewCohere(ingress.Client(), opts.CohereApiKey()); err == nil {
tldr = &summary.Summary{Handler: coh}
}
diff --git a/summary/locally.go b/summary/legacy.go
similarity index 62%
rename from summary/locally.go
rename to summary/legacy.go
index 4e48e1a8..b69fed9f 100644
--- a/summary/locally.go
+++ b/summary/legacy.go
@@ -8,28 +8,28 @@ import (
"fmt"
"strings"
- "github.com/JesusIslam/tldr"
+ "github.com/didasy/tldr"
)
const maxCharacters = 128
// Interface guard
-var _ Summarizer = (*Locally)(nil)
+var _ Summarizer = (*Legacy)(nil)
-// Locally implements the Summarizer interface using the tldr.Bag package to
+// Legacy implements the Summarizer interface using the tldr.Bag package to
// perform local summarization.
-type Locally struct {
+type Legacy struct {
*tldr.Bag
}
-// NewLocally creates a new instance of the Locally struct with a new tldr.Bag instance.
-func NewLocally() *Locally {
- return &Locally{tldr.New()}
+// NewLegacy creates a new instance of the Legacy struct with a new tldr.Bag instance.
+func NewLegacy() *Legacy {
+ return &Legacy{tldr.New()}
}
-// Summarize generates a summary of the input text using local summarization.
+// Summarize generates a summary of the input text using legacy summarization.
// It returns the summary as a string and any error that occurred during summarization.
-func (l *Locally) Summarize(s string) (string, error) {
+func (l *Legacy) Summarize(s string) (string, error) {
s = strings.TrimSpace(s)
if s == "" {
return "", fmt.Errorf("text not found")
diff --git a/summary/locally_test.go b/summary/legacy_test.go
similarity index 95%
rename from summary/locally_test.go
rename to summary/legacy_test.go
index da8f1d62..2ea661a7 100644
--- a/summary/locally_test.go
+++ b/summary/legacy_test.go
@@ -8,7 +8,7 @@ import (
"testing"
)
-func TestLocally(t *testing.T) {
+func TestLegacy(t *testing.T) {
// Define test cases as a slice of structs.
tests := []struct {
name string
@@ -33,7 +33,7 @@ func TestLocally(t *testing.T) {
},
}
- local := NewLocally()
+ local := NewLegacy()
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
diff --git a/summary/summary.go b/summary/summary.go
index 87b3b5b3..552d6cf2 100644
--- a/summary/summary.go
+++ b/summary/summary.go
@@ -36,7 +36,7 @@ func (sum *Summary) Summarize(s string) (string, error) {
switch handler := sum.Handler.(type) {
case *Cohere:
return handler.Summarize(s)
- case *Locally:
+ case *Legacy:
return handler.Summarize(s)
default:
return "", fmt.Errorf("invalid handler")
diff --git a/summary/summary_test.go b/summary/summary_test.go
index 00d72c88..19566be4 100644
--- a/summary/summary_test.go
+++ b/summary/summary_test.go
@@ -36,7 +36,7 @@ func TestSummarize(t *testing.T) {
},
{
name: "valid Locally handler",
- handler: NewLocally(),
+ handler: NewLegacy(),
input: "This is a test string.",
wantErr: false,
errMessage: "",
From 01c9450bb47394ead17997756f82c292f8007490 Mon Sep 17 00:00:00 2001
From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com>
Date: Sun, 26 Apr 2026 14:43:16 +0000
Subject: [PATCH 5/8] Add openrouter provider
---
config/options.go | 34 ++++++-
config/parser.go | 8 +-
docs/environment.md | 4 +-
go.mod | 4 +-
go.sum | 8 +-
reduxer/reduxer.go | 8 +-
summary/chat.go | 44 +++++++++
summary/cohere.go | 68 +++++++++++---
summary/cohere_test.go | 122 ++++++++++++++++++-------
summary/openrouter.go | 95 +++++++++++++++++++
summary/openrouter_test.go | 183 +++++++++++++++++++++++++++++++++++++
summary/summary.go | 32 +++----
summary/summary_test.go | 33 ++++---
wayback.1 | 12 ++-
wayback.conf | 4 +-
15 files changed, 556 insertions(+), 103 deletions(-)
create mode 100644 summary/chat.go
create mode 100644 summary/openrouter.go
create mode 100644 summary/openrouter_test.go
diff --git a/config/options.go b/config/options.go
index ece12f82..9c9ca3d3 100644
--- a/config/options.go
+++ b/config/options.go
@@ -113,6 +113,10 @@ const (
defDatabaseMinConns = 1
defDatabaseConnectionLifetime = 5
+ defLLMProvider = ""
+ defLLMApiKey = ""
+ defLLMModel = ""
+
maxAttachSizeTelegram = 50000000 // 50MB
maxAttachSizeDiscord = 8000000 // 8MB
maxAttachSizeSlack = 5000000000 // 5GB
@@ -146,6 +150,7 @@ type Options struct {
notion *notion
matrix *matrix
slack *slack
+ llm *llm
services sync.Map
privacyURL string
storageDir string
@@ -157,7 +162,6 @@ type Options struct {
boltPathname string
maxMediaSize string
poolingSize int
- cohereApiKey string
waybackTimeout int
waybackMaxRetries int
enabledChromeRemote bool
@@ -271,6 +275,12 @@ type meili struct {
apikey string
}
+type llm struct {
+ provider string
+ apikey string
+ model string
+}
+
type omnivore struct {
apikey string
}
@@ -291,7 +301,6 @@ func NewOptions() *Options {
storageDir: defStorageDir,
maxMediaSize: defMaxMediaSize,
privacyURL: defPrivacyURL,
- cohereApiKey: defCohereApiKey,
waybackTimeout: defWaybackTimeout,
waybackMaxRetries: defWaybackMaxRetries,
waybackUserAgent: defWaybackUserAgent,
@@ -389,6 +398,11 @@ func NewOptions() *Options {
indexing: defMeiliIndexing,
apikey: defMeiliApikey,
},
+ llm: &llm{
+ provider: defLLMProvider,
+ apikey: defLLMApiKey,
+ model: defLLMModel,
+ },
omnivore: &omnivore{
apikey: defOmnivoreApikey,
},
@@ -954,9 +968,19 @@ func (o *Options) MaxMediaSize() uint64 {
return size
}
-// CohereApiKey returns the apikey of Cohere.
-func (o *Options) CohereApiKey() string {
- return o.cohereApiKey
+// LLMProvider returns the LLM provider.
+func (o *Options) LLMProvider() string {
+ return o.llm.provider
+}
+
+// LLMApiKey returns the apikey of LLM provider.
+func (o *Options) LLMApiKey() string {
+ return o.llm.apikey
+}
+
+// LLMModel returns the model of LLM provider.
+func (o *Options) LLMModel() string {
+ return o.llm.model
}
// MaxAttachSize returns max attach size limits for several services.
diff --git a/config/parser.go b/config/parser.go
index 00a00402..0d3df878 100644
--- a/config/parser.go
+++ b/config/parser.go
@@ -223,8 +223,6 @@ func (p *Parser) parseLines(lines []string) (err error) {
p.opts.storageDir = parseString(val, defStorageDir)
case "WAYBACK_MAX_MEDIA_SIZE":
p.opts.maxMediaSize = parseString(val, defMaxMediaSize)
- case "WAYBACK_COHERE_APIKEY":
- p.opts.cohereApiKey = parseString(val, defCohereApiKey)
case "WAYBACK_TIMEOUT":
p.opts.waybackTimeout = parseInt(val, defWaybackTimeout)
case "WAYBACK_MAX_RETRIES":
@@ -239,6 +237,12 @@ func (p *Parser) parseLines(lines []string) (err error) {
p.opts.meili.indexing = parseString(val, defMeiliIndexing)
case "WAYBACK_MEILI_APIKEY":
p.opts.meili.apikey = parseString(val, defMeiliApikey)
+ case "WAYBACK_LLM_PROVIDER":
+ p.opts.llm.provider = parseString(val, defLLMProvider)
+ case "WAYBACK_LLM_APIKEY":
+ p.opts.llm.apikey = parseString(val, defLLMApiKey)
+ case "WAYBACK_LLM_MODEL":
+ p.opts.llm.model = parseString(val, defLLMModel)
case "WAYBACK_OMNIVORE_APIKEY":
p.opts.omnivore.apikey = parseString(val, defOmnivoreApikey)
case "WAYBACK_PRIVACY_URL":
diff --git a/docs/environment.md b/docs/environment.md
index a2a15de1..ef15eb7f 100644
--- a/docs/environment.md
+++ b/docs/environment.md
@@ -32,7 +32,6 @@ Use the `-c` / `--config` option to specify the build definition file to use.
| - | `WAYBACK_BOLT_PATH` | `./wayback.db` | File path of bolt database |
| - | `WAYBACK_STORAGE_DIR` | - | Directory to store binary file, e.g. PDF, html file |
| - | `WAYBACK_MAX_MEDIA_SIZE` | `512MB` | Max size to limit download stream media |
-| - | `WAYBACK_COHERE_APIKEY` | `` | Cohere API key |
| - | `WAYBACK_MEDIA_SITES` | - | Extra media websites wish to be supported, separate with comma |
| - | `WAYBACK_TIMEOUT` | `300` | Timeout for single wayback request, defaults to 300 second |
| - | `WAYBACK_MAX_RETRIES` | `2` | Max retries for single wayback request, defaults to 2 |
@@ -103,6 +102,9 @@ Use the `-c` / `--config` option to specify the build definition file to use.
| - | `WAYBACK_ONION_LOCAL_PORT` | `8964` | Local port for Tor Hidden Service, also support for a **reverse proxy**. This is ignored if `WAYBACK_LISTEN_ADDR` is set. |
| - | `WAYBACK_ONION_REMOTE_PORTS` | `80` | Remote ports for Tor Hidden Service, e.g. `WAYBACK_ONION_REMOTE_PORTS=80,81` |
| - | `WAYBACK_ONION_DISABLED` | `false` | Disable onion service |
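+| - | `WAYBACK_LLM_PROVIDER` | `` | LLM provider for AI-enhanced summaries: `cohere` or `openrouter` (case-insensitive). When unset or unrecognized, the legacy local summarizer is used |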
+| - | `WAYBACK_LLM_APIKEY` | `` | LLM API key |
+| - | `WAYBACK_LLM_MODEL` | `` | LLM model. Each provider has a sensible default: cohere: command-a-03-2025 \| openrouter: openrouter/auto. |
| - | `WAYBACK_SLOT` | - | Pinning service for IPFS mode of pinner, see [ipfs-pinner](https://github.com/wabarc/ipfs-pinner#supported-pinning-services) |
| - | `WAYBACK_APIKEY` | - | API key for pinning service |
| - | `WAYBACK_SECRET` | - | API secret for pinning service |
diff --git a/go.mod b/go.mod
index ecb46dbd..8ccca4b3 100644
--- a/go.mod
+++ b/go.mod
@@ -5,14 +5,13 @@ module github.com/wabarc/wayback
go 1.24.0
require (
- github.com/JesusIslam/tldr v0.6.0
github.com/PuerkitoBio/goquery v1.9.0
github.com/bwmarrin/discordgo v0.28.1
- github.com/cohere-ai/cohere-go v0.2.0
github.com/cretz/bine v0.2.0
github.com/davecgh/go-spew v1.1.1
github.com/dghubble/go-twitter v0.0.0-20201011215211-4b180d0cc78d
github.com/dghubble/oauth1 v0.7.1
+ github.com/didasy/tldr v0.7.0
github.com/dstotijn/go-notion v0.11.0
github.com/dustin/go-humanize v1.0.0
github.com/gabriel-vasile/mimetype v1.4.2
@@ -85,7 +84,6 @@ require (
github.com/chromedp/chromedp v0.9.5 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/cloudflare/circl v1.3.7 // indirect
- github.com/cohere-ai/tokenizer v1.1.2 // indirect
github.com/crackcomm/go-gitignore v0.0.0-20170627025303-887ab5e44cc3 // indirect
github.com/decred/dcrd/crypto/blake256 v1.0.1 // indirect
github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 // indirect
diff --git a/go.sum b/go.sum
index eb2d5781..7ffbe835 100644
--- a/go.sum
+++ b/go.sum
@@ -2,8 +2,6 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
-github.com/JesusIslam/tldr v0.6.0 h1:b5jc9m77g9vs9iREKSitBWhyC6YdemtqjAqiCJycwt0=
-github.com/JesusIslam/tldr v0.6.0/go.mod h1:qnHomoqHP4q5qvOPggMBAnq7PB1V0CGF3+Dr4pcos74=
github.com/MercuryEngineering/CookieMonster v0.0.0-20180304172713-1584578b3403 h1:EtZwYyLbkEcIt+B//6sujwRCnHuTEK3qiSypAX5aJeM=
github.com/MercuryEngineering/CookieMonster v0.0.0-20180304172713-1584578b3403/go.mod h1:mM6WvakkX2m+NgMiPCfFFjwfH4KzENC07zeGEqq9U7s=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
@@ -88,10 +86,6 @@ github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMn
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vcU=
github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA=
-github.com/cohere-ai/cohere-go v0.2.0 h1:Gljkn8LTtsAPy79ks1AVmZH9Av4kuQuXEgzEJ/1Ea34=
-github.com/cohere-ai/cohere-go v0.2.0/go.mod h1:DFcCu5rwro4wAlluIXY9l17NLGiVBGb2bRio46RXBm8=
-github.com/cohere-ai/tokenizer v1.1.2 h1:t3KwUBSpKiBVFtpnHBfVIQNmjfZUuqFVYuSFkZYOWpU=
-github.com/cohere-ai/tokenizer v1.1.2/go.mod h1:9MNFPd9j1fuiEK3ua2HSCUxxcrfGMlSqpa93livg/C0=
github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk=
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
@@ -126,6 +120,8 @@ github.com/dghubble/sling v1.3.0 h1:pZHjCJq4zJvc6qVQ5wN1jo5oNZlNE0+8T/h0XeXBUKU=
github.com/dghubble/sling v1.3.0/go.mod h1:XXShWaBWKzNLhu2OxikSNFrlsvowtz4kyRuXUG7oQKY=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
+github.com/didasy/tldr v0.7.0 h1:9kFLpmeGeGPPIRysln8B9USbW+L5zAAlw9ol8gwc2gU=
+github.com/didasy/tldr v0.7.0/go.mod h1:1W7p626SAyEeSkAAzFJLAG/Hr6imK7sxEr+K6x7e7Ao=
github.com/dlclark/regexp2 v1.4.1-0.20201116162257-a2a8dda75c91/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc=
github.com/dlclark/regexp2 v1.7.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
github.com/dlclark/regexp2 v1.9.0 h1:pTK/l/3qYIKaRXuHnEnIf7Y5NxfRPfpb7dis6/gdlVI=
diff --git a/reduxer/reduxer.go b/reduxer/reduxer.go
index 31f297dc..dd6c2cb5 100644
--- a/reduxer/reduxer.go
+++ b/reduxer/reduxer.go
@@ -230,11 +230,11 @@ func Do(ctx context.Context, opts *config.Options, urls ...*url.URL) (Reduxer, e
}
// Generate summary
- tldr := &summary.Summary{Handler: summary.NewLegacy()}
- if coh, err := summary.NewCohere(ingress.Client(), opts.CohereApiKey()); err == nil {
- tldr = &summary.Summary{Handler: coh}
+ summarizer := summary.NewSummary(opts)
+ sum, err := summarizer.Summarize(article.TextContent)
+ if err != nil {
+ logger.Error("summarize failed: %v", err)
}
- sum, _ := tldr.Summarize(article.TextContent) // nolint:errcheck
// Upload files to third-party server
if err = remotely(ctx, artifact); err != nil {
diff --git a/summary/chat.go b/summary/chat.go
new file mode 100644
index 00000000..2f913479
--- /dev/null
+++ b/summary/chat.go
@@ -0,0 +1,44 @@
+// Copyright 2026 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+const systemPrompt = `You are a digital archivist and information synthesiser, your expertise lies in distilling "noise" from legacy web data into high-signal summaries.
+
+
+Rules:
+- Summary point must be anchored by specific verbatim quotes
+- Ignore UI elements (navbars, footers) and focus on the core content
+- Be objective, clinical, and precise. Strip away marketing fluff to reveal the underlying data
+- Summary must be in the same language as the source content
+- Do NOT repeat ideas from previous snapshots unless conditions have materially changed
+
+The output should be a maximum of 280 plain text characters.`
+// chatMessage is one role-tagged message; it is sent in requests and read from OpenRouter choices.
+type chatMessage struct {
+ Role string `json:"role"`
+ Content string `json:"content"`
+}
+// chatRequest is the JSON body posted to a provider's chat endpoint.
+type chatRequest struct {
+ Messages []chatMessage `json:"messages"`
+ Model string `json:"model"`
+}
+// chatContent is one content item of a Cohere response message; it has no JSON tags, so keys match by field name.
+type chatContent struct {
+ Type string
+ Text string
+}
+// chatChoice holds either a list of content items (Contents) or a nested chat message (Message).
+type chatChoice struct {
+ Contents []chatContent `json:"content"`
+ Message chatMessage `json:"message,omitempty"`
+ Role string `json:"role"`
+}
+// chatResponse covers both response shapes read by this package: "message" (Cohere) and "choices" (OpenRouter).
+type chatResponse struct {
+ Message chatChoice `json:"message,omitempty"`
+ Choices []chatChoice `json:"choices,omitempty"`
+ ID string `json:"id"`
+}
diff --git a/summary/cohere.go b/summary/cohere.go
index fb9bcf4d..0149273f 100644
--- a/summary/cohere.go
+++ b/summary/cohere.go
@@ -5,11 +5,14 @@
package summary // import "github.com/wabarc/wayback/summary"
import (
+ "bytes"
+ "encoding/json"
"fmt"
"net/http"
"strings"
- "github.com/cohere-ai/cohere-go"
+ "github.com/wabarc/wayback/config"
+ "github.com/wabarc/wayback/ingress"
)
// Interface guard
@@ -17,22 +20,28 @@ var _ Summarizer = (*Cohere)(nil)
// Cohere represents a text summarization algorithm powered by Cohere's AI models.
type Cohere struct {
- client *cohere.Client
+ client *http.Client
+ apiKey string
+ model string
}
// NewCohere creates a `Cohere` instance with the specified `http.Client` instance and API key.
// If the `http.Client` instance is `nil`, the default client is used. This function returns a pointer
// to the newly created `Cohere` instance and an error, if any.
-func NewCohere(c *http.Client, key string) (*Cohere, error) {
- coh, err := cohere.CreateClient(key)
- if err != nil {
- return nil, err
+func NewCohere(c *http.Client, opts *config.Options) *Cohere {
+ if c == nil {
+ c = ingress.Client()
}
- if c != nil {
- coh.Client = *c
+ model := opts.LLMModel()
+ if model == "" {
+ model = "command-a-03-2025"
}
- return &Cohere{coh}, nil
+ return &Cohere{
+ client: c,
+ apiKey: opts.LLMApiKey(),
+ model: model,
+ }
}
// Summarize generates a summary of the input text using Cohere's AI models.
@@ -43,12 +52,45 @@ func (coh *Cohere) Summarize(s string) (string, error) {
return "", fmt.Errorf("text not found")
}
- res, err := coh.client.Summarize(cohere.SummarizeOptions{
- Text: s,
- })
+ body := chatRequest{
+ Model: coh.model,
+ Messages: []chatMessage{
+ {Role: "system", Content: systemPrompt},
+ {Role: "user", Content: s},
+ },
+ }
+ buf, err := json.Marshal(body)
+ if err != nil {
+ return "", fmt.Errorf("failed to marshal json: %v", err)
+ }
+
+ endpoint := "https://api.cohere.ai/v2/chat"
+ req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(buf))
+ if err != nil {
+ return "", fmt.Errorf("failed to make request: %v", err)
+ }
+ req.Header.Set("Content-Type", "application/json")
+ req.Header.Set("Accept", "application/json")
+ req.Header.Set("Authorization", "Bearer "+coh.apiKey)
+
+ res, err := coh.client.Do(req)
if err != nil {
return "", err
}
+ defer res.Body.Close()
+
+ if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices {
+ return "", fmt.Errorf("cohere api error: status %d", res.StatusCode)
+ }
+
+ var cr chatResponse
+ if err := json.NewDecoder(res.Body).Decode(&cr); err != nil {
+ return "", fmt.Errorf("failed to decode body: %v", err)
+ }
+
+ if len(cr.Message.Contents) > 0 && strings.TrimSpace(cr.Message.Contents[0].Text) != "" {
+ return strings.TrimSpace(cr.Message.Contents[0].Text), nil
+ }
- return res.Summary, nil
+ return s, nil
}
diff --git a/summary/cohere_test.go b/summary/cohere_test.go
index de22e341..67258b0d 100644
--- a/summary/cohere_test.go
+++ b/summary/cohere_test.go
@@ -5,18 +5,19 @@
package summary // import "github.com/wabarc/wayback/summary"
import (
+ "encoding/json"
"fmt"
"net/http"
"os"
- "reflect"
+ "strings"
"testing"
- "github.com/cohere-ai/cohere-go"
"github.com/wabarc/helper"
+ "github.com/wabarc/wayback/config"
)
var (
- apiKey = os.Getenv("COHERE_APIKEY")
+ apiKey = os.Getenv("WAYBACK_LLM_APIKEY")
summarized = "This is a summary of the test input."
summarizeResponse = []byte(fmt.Sprintf(`{
"summary": "%s"
@@ -25,17 +26,13 @@ var (
handleFunc = func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
switch r.URL.Path {
- case "/summarize":
+ case "/v2/chat":
w.Write(summarizeResponse)
}
}
)
func TestNewCohere(t *testing.T) {
- if apiKey == "" {
- t.Skip(`Must set env "COHERE_APIKEY"`)
- }
-
httpClient, mux, server := helper.MockServer()
defer server.Close()
@@ -73,13 +70,16 @@ func TestNewCohere(t *testing.T) {
for _, tt := range tests {
t.Run(tt.desc, func(t *testing.T) {
- cohere, err := NewCohere(tt.client, tt.key)
- if tt.expectErr && err == nil {
- t.Errorf("Expected error but got nil")
- }
- if tt.expectNil && cohere != nil {
- t.Errorf("Expected nil value for Cohere instance")
+ t.Setenv("WAYBACK_LLM_PROVIDER", "cohere")
+ t.Setenv("WAYBACK_LLM_APIKEY", tt.key)
+
+ parser := config.NewParser()
+ opts, err := parser.ParseEnvironmentVariables()
+ if err != nil {
+ t.Fatalf("Parse environment variables or flags failed, error: %v", err)
}
+
+ cohere := NewCohere(tt.client, opts)
if !tt.expectNil && cohere == nil {
t.Errorf("Unexpected nil value for Cohere instance")
}
@@ -87,47 +87,107 @@ func TestNewCohere(t *testing.T) {
}
}
-func TestCohere_Summarize(t *testing.T) {
+func TestCohereSummarize(t *testing.T) {
tests := []struct {
name string
input string
+ mockStatus int
+ mockBody string
expected string
- expectedErr error
+ expectedErr string
}{
{
name: "Empty string",
input: "",
expected: "",
- expectedErr: fmt.Errorf("text not found"),
+ expectedErr: "text not found",
},
{
- name: "Valid input",
- input: "This is a test input for summarization.",
- expected: summarized,
- expectedErr: nil,
+ name: "Valid input",
+ input: "This is a test input for summarization.",
+ mockStatus: 200,
+ mockBody: `{
+ "messages":[
+ {"role":"user","content":"This is the summary."}
+ ]
+ }`,
+ expected: "This is the summary.",
+ expectedErr: "",
+ },
+ {
+ name: "API error status",
+ input: "Non-empty",
+ mockStatus: 500,
+ mockBody: `{"error":"server"}`,
+ expected: "",
+ expectedErr: "cohere api error: status 500",
},
}
httpClient, mux, server := helper.MockServer()
defer server.Close()
- mux.HandleFunc("/", handleFunc)
-
- cohereClient := &cohere.Client{Client: *httpClient, BaseURL: server.URL + "/"}
+	// Mock Cohere's v2 chat endpoint; replies use the shape Cohere.Summarize decodes.
+	mux.HandleFunc("/v2/chat", func(w http.ResponseWriter, r *http.Request) {
+		// The client always POSTs; reject anything else.
+		if r.Method != http.MethodPost {
+			http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+			return
+		}
+		// Pick the reply from the user message, which follows the system prompt at index 1.
+		var req struct {
+			Messages []struct {
+				Content string `json:"content"`
+			} `json:"messages"`
+		}
+		if err := json.NewDecoder(r.Body).Decode(&req); err != nil || len(req.Messages) < 2 {
+			http.Error(w, "bad request", http.StatusBadRequest)
+			return
+		}
+		switch {
+		case strings.Contains(req.Messages[1].Content, "This is a test input for summarization."):
+			w.WriteHeader(200)
+			w.Write([]byte(`{"message":{"role":"assistant","content":[{"type":"text","text":"This is the summary."}]}}`))
+		case strings.Contains(req.Messages[1].Content, "Non-empty"):
+			w.WriteHeader(500)
+			w.Write([]byte("server error"))
+		default:
+			// default success
+			w.WriteHeader(200)
+			w.Write([]byte(`{"message":{"role":"assistant","content":[{"type":"text","text":"ok"}]}}`))
+		}
+	})
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- coh := &Cohere{client: cohereClient}
+ t.Setenv("WAYBACK_LLM_PROVIDER", "cohere")
+ t.Setenv("WAYBACK_LLM_APIKEY", "test-key")
+
+ parser := config.NewParser()
+ opts, err := parser.ParseEnvironmentVariables()
+ if err != nil {
+ t.Fatalf("Parse environment variables or flags failed, error: %v", err)
+ }
+
+ coh := NewCohere(httpClient, opts)
- // Call the Summarize method
actual, actualErr := coh.Summarize(tt.input)
- // Check the results
- if tt.expected != actual {
- t.Fatalf(`unexpected summarize, got "%v" instead of "%v"`, actual, tt.expected)
+ if tt.expectedErr != "" {
+ if actualErr == nil {
+ t.Fatalf("expected error %q, got nil", tt.expectedErr)
+ }
+ if actualErr.Error() != tt.expectedErr {
+ t.Fatalf("unexpected error, got %q expected %q", actualErr.Error(), tt.expectedErr)
+ }
+ return
+ }
+
+ if actualErr != nil {
+ t.Fatalf("unexpected error: %v", actualErr)
}
- if !reflect.DeepEqual(tt.expectedErr, actualErr) {
- t.Fatalf(`unexpected summarize, got "%v" instead of "%v"`, actualErr, tt.expectedErr)
+ if actual != tt.expected {
+ t.Fatalf(`unexpected summary, got "%v" instead of "%v"`, actual, tt.expected)
}
})
}
diff --git a/summary/openrouter.go b/summary/openrouter.go
new file mode 100644
index 00000000..d6cd87f1
--- /dev/null
+++ b/summary/openrouter.go
@@ -0,0 +1,95 @@
+// Copyright 2026 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "net/http"
+ "strings"
+
+ "github.com/wabarc/wayback/config"
+ "github.com/wabarc/wayback/ingress"
+)
+
+// Interface guard
+var _ Summarizer = (*OpenRouter)(nil)
+
+// OpenRouter represents a text summarization client for OpenRouter LLM service.
+type OpenRouter struct {
+ client *http.Client
+ apiKey string
+ model string
+}
+
+// NewOpenRouter creates an `OpenRouter` instance with the specified `http.Client` and options.
+// If the `http.Client` instance is `nil`, ingress.Client() is used; if no model is configured,
+// "openrouter/auto" is used. It returns a pointer to the newly created `OpenRouter` instance.
+func NewOpenRouter(c *http.Client, opts *config.Options) *OpenRouter {
+ if c == nil {
+ c = ingress.Client()
+ }
+ model := opts.LLMModel()
+ if model == "" {
+ model = "openrouter/auto"
+ }
+
+ return &OpenRouter{
+ client: c,
+ apiKey: opts.LLMApiKey(),
+ model: model,
+ }
+}
+
+// Summarize asks OpenRouter's chat completions API to summarize s.
+// It returns an error if s is blank, the request cannot be built, sent or decoded, or the status is not 2xx.
+// If the response carries no usable text, the trimmed input is returned unchanged.
+func (o *OpenRouter) Summarize(s string) (string, error) {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return "", fmt.Errorf("text not found")
+	}
+
+	body := chatRequest{
+		Model: o.model,
+		Messages: []chatMessage{
+			{Role: "system", Content: systemPrompt},
+			{Role: "user", Content: s},
+		},
+	}
+	buf, err := json.Marshal(body)
+	if err != nil {
+		return "", fmt.Errorf("failed to marshal json: %w", err)
+	}
+
+	endpoint := "https://openrouter.ai/api/v1/chat/completions"
+	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(buf))
+	if err != nil {
+		return "", fmt.Errorf("failed to make request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+o.apiKey)
+
+	res, err := o.client.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer res.Body.Close()
+
+	if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices {
+		return "", fmt.Errorf("openrouter api error: status %d", res.StatusCode)
+	}
+
+	var cr chatResponse
+	if err := json.NewDecoder(res.Body).Decode(&cr); err != nil {
+		return "", fmt.Errorf("failed to decode body: %w", err)
+	}
+	// The summary text is read from choices[0].message.content.
+	if len(cr.Choices) > 0 && strings.TrimSpace(cr.Choices[0].Message.Content) != "" {
+		return strings.TrimSpace(cr.Choices[0].Message.Content), nil
+	}
+	return s, nil
+}
diff --git a/summary/openrouter_test.go b/summary/openrouter_test.go
new file mode 100644
index 00000000..0b262735
--- /dev/null
+++ b/summary/openrouter_test.go
@@ -0,0 +1,183 @@
+// Copyright 2026 Wayback Archiver. All rights reserved.
+// Use of this source code is governed by the GNU GPL v3
+// license that can be found in the LICENSE file.
+
+package summary // import "github.com/wabarc/wayback/summary"
+
+import (
+	"encoding/json"
+	"net/http"
+	"strings"
+	"testing"
+
+	"github.com/wabarc/helper"
+	"github.com/wabarc/wayback/config"
+)
+
+func TestNewOpenRouter(t *testing.T) {
+	httpClient, mux, server := helper.MockServer()
+	defer server.Close()
+
+	handleFunc := func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		switch r.URL.Path {
+		case "/api/v1/chat/completions":
+			w.Write(summarizeResponse)
+		}
+	}
+	mux.HandleFunc("/", handleFunc)
+
+	tests := []struct {
+		desc      string
+		client    *http.Client
+		key       string
+		expectErr bool
+		expectNil bool
+	}{
+		{
+			desc:      "Valid inputs",
+			client:    httpClient,
+			key:       "valid_api_key",
+			expectErr: false,
+			expectNil: false,
+		},
+		{
+			desc:      "Invalid API key",
+			client:    httpClient,
+			key:       apiKey,
+			expectErr: true,
+			expectNil: true,
+		},
+		{
+			desc:      "Nil http.Client",
+			client:    nil,
+			key:       apiKey,
+			expectErr: false,
+			expectNil: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.desc, func(t *testing.T) {
+			t.Setenv("WAYBACK_LLM_PROVIDER", "openrouter")
+			t.Setenv("WAYBACK_LLM_APIKEY", tt.key)
+
+			parser := config.NewParser()
+			opts, err := parser.ParseEnvironmentVariables()
+			if err != nil {
+				t.Fatalf("Parse environment variables or flags failed, error: %v", err)
+			}
+
+			client := NewOpenRouter(tt.client, opts)
+			if !tt.expectNil && client == nil {
+				t.Errorf("Unexpected nil value for OpenRouter instance")
+			}
+		})
+	}
+}
+
+func TestOpenRouterSummarize(t *testing.T) {
+	tests := []struct {
+		name        string
+		input       string
+		mockStatus  int
+		mockBody    string
+		expected    string
+		expectedErr string
+	}{
+		{
+			name:        "Empty string",
+			input:       "",
+			expected:    "",
+			expectedErr: "text not found",
+		},
+		{
+			name:       "Valid input",
+			input:      "This is a test input for summarization.",
+			mockStatus: 200,
+			mockBody: `{
+				"choices":[
+					{"message":{"role":"assistant","content":"This is the summary."}}
+				]
+			}`,
+			expected:    "This is the summary.",
+			expectedErr: "",
+		},
+		{
+			name:        "API error status",
+			input:       "Non-empty",
+			mockStatus:  500,
+			mockBody:    `{"error":"server"}`,
+			expected:    "",
+			expectedErr: "openrouter api error: status 500",
+		},
+	}
+
+	httpClient, mux, server := helper.MockServer()
+	defer server.Close()
+
+	// Mock OpenRouter's chat completions endpoint; replies use the shape OpenRouter.Summarize decodes.
+	mux.HandleFunc("/api/v1/chat/completions", func(w http.ResponseWriter, r *http.Request) {
+		// The client always POSTs; reject anything else.
+		if r.Method != http.MethodPost {
+			http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+			return
+		}
+		// Pick the reply from the user message, which follows the system prompt at index 1.
+		var req struct {
+			Messages []struct {
+				Content string `json:"content"`
+			} `json:"messages"`
+		}
+		if err := json.NewDecoder(r.Body).Decode(&req); err != nil || len(req.Messages) < 2 {
+			http.Error(w, "bad request", http.StatusBadRequest)
+			return
+		}
+		switch {
+		case strings.Contains(req.Messages[1].Content, "This is a test input for summarization."):
+			w.WriteHeader(200)
+			w.Write([]byte(`{"choices":[{"message":{"role":"assistant","content":"This is the summary."}}]}`))
+		case strings.Contains(req.Messages[1].Content, "Non-empty"):
+			w.WriteHeader(500)
+			w.Write([]byte("server error"))
+		default:
+			// default success
+			w.WriteHeader(200)
+			w.Write([]byte(`{"choices":[{"message":{"role":"assistant","content":"ok"}}]}`))
+		}
+	})
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Setenv("WAYBACK_LLM_PROVIDER", "openrouter")
+			t.Setenv("WAYBACK_LLM_APIKEY", "test-key")
+
+			parser := config.NewParser()
+			opts, err := parser.ParseEnvironmentVariables()
+			if err != nil {
+				t.Fatalf("Parse environment variables or flags failed, error: %v", err)
+			}
+
+			client := NewOpenRouter(httpClient, opts)
+
+			actual, actualErr := client.Summarize(tt.input)
+
+			if tt.expectedErr != "" {
+				if actualErr == nil {
+					t.Fatalf("expected error %q, got nil", tt.expectedErr)
+				}
+				if actualErr.Error() != tt.expectedErr {
+					t.Fatalf("unexpected error, got %q expected %q", actualErr.Error(), tt.expectedErr)
+				}
+				return
+			}
+
+			if actualErr != nil {
+				t.Fatalf("unexpected error: %v", actualErr)
+			}
+			if actual != tt.expected {
+				t.Fatalf(`unexpected summary, got "%v" instead of "%v"`, actual, tt.expected)
+			}
+		})
+	}
+}
diff --git a/summary/summary.go b/summary/summary.go
index 552d6cf2..edf1da5d 100644
--- a/summary/summary.go
+++ b/summary/summary.go
@@ -5,8 +5,10 @@
package summary // import "github.com/wabarc/wayback/summary"
import (
- "fmt"
"strings"
+
+ "github.com/wabarc/wayback/config"
+ "github.com/wabarc/wayback/ingress"
)
// Summarizer is the interface that wraps the basic Summarize method.
@@ -16,29 +18,23 @@ type Summarizer interface {
Summarize(s string) (string, error)
}
-// Interface guard
-var _ Summarizer = (*Summary)(nil)
-
// Summary provides a high-level interface for generating text summaries using
// different summarization methods.
type Summary struct {
Handler interface{}
}
-// Summarize generates a summary of the input text using the selected summarization method.
-// It returns the summary as a string and any error that occurred during summarization.
-func (sum *Summary) Summarize(s string) (string, error) {
- s = strings.TrimSpace(s)
- if s == "" {
- return "", fmt.Errorf("text not found")
+// NewSummary creates and returns a Summarizer based on the configured LLM provider.
+// It inspects opts.LLMProvider() (case-insensitive) and constructs a provider-specific
+// handler. It falls back to the legacy summarizer implementation.
+// The chosen handler is returned directly as the Summarizer.
+func NewSummary(opts *config.Options) Summarizer {
+ switch strings.ToLower(opts.LLMProvider()) {
+ case "cohere":
+ return NewCohere(ingress.Client(), opts)
+ case "openrouter":
+ return NewOpenRouter(ingress.Client(), opts)
}
- switch handler := sum.Handler.(type) {
- case *Cohere:
- return handler.Summarize(s)
- case *Legacy:
- return handler.Summarize(s)
- default:
- return "", fmt.Errorf("invalid handler")
- }
+ return NewLegacy()
}
diff --git a/summary/summary_test.go b/summary/summary_test.go
index 19566be4..0db62241 100644
--- a/summary/summary_test.go
+++ b/summary/summary_test.go
@@ -7,8 +7,8 @@ package summary // import "github.com/wabarc/wayback/summary"
import (
"testing"
- "github.com/cohere-ai/cohere-go"
"github.com/wabarc/helper"
+ "github.com/wabarc/wayback/config"
)
func TestSummarize(t *testing.T) {
@@ -17,39 +17,40 @@ func TestSummarize(t *testing.T) {
mux.HandleFunc("/", handleFunc)
- cohereClient := &cohere.Client{Client: *httpClient, BaseURL: server.URL + "/"}
- coh := &Cohere{client: cohereClient}
+ t.Setenv("WAYBACK_LLM_PROVIDER", "cohere")
+ t.Setenv("WAYBACK_LLM_APIKEY", "test-key")
+
+ parser := config.NewParser()
+ opts, err := parser.ParseEnvironmentVariables()
+ if err != nil {
+ t.Fatalf("Parse environment variables or flags failed, error: %v", err)
+ }
+
+ coh := NewCohere(httpClient, opts)
tests := []struct {
name string
- handler interface{}
+ handler Summarizer
input string
wantErr bool
errMessage string
}{
{
- name: "valid Cohere handler",
+ name: "Valid Cohere handler",
handler: coh,
input: "This is a test string.",
wantErr: false,
errMessage: "",
},
{
- name: "valid Locally handler",
+ name: "Valid Locally handler",
handler: NewLegacy(),
input: "This is a test string.",
wantErr: false,
errMessage: "",
},
{
- name: "invalid handler",
- handler: "invalid-handler",
- input: "This is a test string.",
- wantErr: true,
- errMessage: "invalid handler",
- },
- {
- name: "empty input",
+ name: "Empty input",
handler: coh,
input: "",
wantErr: true,
@@ -59,9 +60,7 @@ func TestSummarize(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- sum := &Summary{Handler: tt.handler}
-
- _, err := sum.Summarize(tt.input)
+ _, err := tt.handler.Summarize(tt.input)
if (err != nil) != tt.wantErr {
t.Fatalf(`Unexpected error status. Got "%v", but wanted error="%v"`, err, tt.wantErr)
diff --git a/wayback.1 b/wayback.1
index d89cd1aa..9d6dcc70 100644
--- a/wayback.1
+++ b/wayback.1
@@ -224,8 +224,16 @@ Directory to store binary file, e.g. PDF, html file\&.
.B WAYBACK_MAX_MEDIA_SIZE
Max size to limit download stream media. default 512MB\&.
.TP
-.B WAYBACK_COHERE_APIKEY
-Cohere API key\&.
+.B WAYBACK_LLM_PROVIDER
+Enables AI-enhanced summary. Provider options: cohere | openrouter\&.
+.TP
+.B WAYBACK_LLM_APIKEY
+LLM API key\&.
+.TP
+.B WAYBACK_LLM_MODEL
+LLM model. Each provider has a sensible default:
+.br
+cohere: command-a-03-2025 | openrouter: openrouter/auto\&.
.TP
.B WAYBACK_MEDIA_SITES
Extra media websites wish to be supported, separate with comma\&.
diff --git a/wayback.conf b/wayback.conf
index b227e50c..291c278d 100644
--- a/wayback.conf
+++ b/wayback.conf
@@ -76,7 +76,9 @@ WAYBACK_USERAGENT=WaybackArchiver/1.0
WAYBACK_FALLBACK=off
WAYBACK_PROXY=
WAYBACK_PRIVACY_URL=
-WAYBACK_COHERE_APIKEY=
+WAYBACK_LLM_PROVIDER=
+WAYBACK_LLM_APIKEY=
+WAYBACK_LLM_MODEL=
# ipfs slot: infura, pinata
# doc: https://github.com/wabarc/ipfs-pinner#supported-pinning-services
From c841bb70caa43ee765bd71c80e1f216fd8cf7f36 Mon Sep 17 00:00:00 2001
From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com>
Date: Sun, 26 Apr 2026 14:56:47 +0000
Subject: [PATCH 6/8] Make linter happy
---
summary/chat.go | 8 ++++----
summary/legacy.go | 2 +-
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/summary/chat.go b/summary/chat.go
index 2f913479..35383ec4 100644
--- a/summary/chat.go
+++ b/summary/chat.go
@@ -4,7 +4,7 @@
package summary // import "github.com/wabarc/wayback/summary"
-const systemPrompt = `You are a digital archivist and information synthesiser, your expertise lies in distilling "noise" from legacy web data into high-signal summaries.
+const systemPrompt = `You are a digital archivist and information synthesizer, your expertise lies in distilling "noise" from legacy web data into high-signal summaries.
Rules:
@@ -22,8 +22,8 @@ type chatMessage struct {
}
type chatRequest struct {
- Messages []chatMessage `json:"messages"`
Model string `json:"model"`
+ Messages []chatMessage `json:"messages"`
}
type chatContent struct {
@@ -32,13 +32,13 @@ type chatContent struct {
}
type chatChoice struct {
- Contents []chatContent `json:"content"`
Message chatMessage `json:"message,omitempty"`
Role string `json:"role"`
+ Contents []chatContent `json:"content"`
}
type chatResponse struct {
Message chatChoice `json:"message,omitempty"`
- Choices []chatChoice `json:"choices,omitempty"`
ID string `json:"id"`
+ Choices []chatChoice `json:"choices,omitempty"`
}
diff --git a/summary/legacy.go b/summary/legacy.go
index b69fed9f..e25f70bf 100644
--- a/summary/legacy.go
+++ b/summary/legacy.go
@@ -35,7 +35,7 @@ func (l *Legacy) Summarize(s string) (string, error) {
return "", fmt.Errorf("text not found")
}
- l.Bag.MaxCharacters = maxCharacters
+ l.MaxCharacters = maxCharacters
res, err := l.Bag.Summarize(s, 1)
if err != nil {
return "", fmt.Errorf("summarize failed: %v", err)
From a07631ef27304d414ce21e6f978dfab1e6728d98 Mon Sep 17 00:00:00 2001
From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com>
Date: Sun, 3 May 2026 03:19:31 +0000
Subject: [PATCH 7/8] Minor changes
---
summary/chat.go | 3 +--
summary/openrouter.go | 10 +++++-----
summary/summary.go | 6 ------
3 files changed, 6 insertions(+), 13 deletions(-)
diff --git a/summary/chat.go b/summary/chat.go
index 35383ec4..eb477ac6 100644
--- a/summary/chat.go
+++ b/summary/chat.go
@@ -6,7 +6,6 @@ package summary // import "github.com/wabarc/wayback/summary"
const systemPrompt = `You are a digital archivist and information synthesizer, your expertise lies in distilling "noise" from legacy web data into high-signal summaries.
-
Rules:
- Summary point must be anchored by specific verbatim quotes
- Ignore UI elements (navbars, footers) and focus on the core content
@@ -14,7 +13,7 @@ Rules:
- Summary must be in the same language as the source content
- Do NOT repeat ideas from previous snapshots unless conditions have materially changed
-The output should be a maximum of 280 plain text characters.`
+The output should be a maximum of 280 plain paragraphs.`
type chatMessage struct {
Role string `json:"role"`
diff --git a/summary/openrouter.go b/summary/openrouter.go
index d6cd87f1..56a06c07 100644
--- a/summary/openrouter.go
+++ b/summary/openrouter.go
@@ -46,14 +46,14 @@ func NewOpenRouter(c *http.Client, opts *config.Options) *OpenRouter {
// Summarize generates a summary of the input text using OpenRouter's AI models.
// Returns the generated summary as a string and an error, if any.
-func (coh *OpenRouter) Summarize(s string) (string, error) {
+func (or *OpenRouter) Summarize(s string) (string, error) {
s = strings.TrimSpace(s)
if s == "" {
return "", fmt.Errorf("text not found")
}
body := chatRequest{
- Model: coh.model,
+ Model: or.model,
Messages: []chatMessage{
{Role: "system", Content: systemPrompt},
{Role: "user", Content: s},
@@ -70,16 +70,16 @@ func (coh *OpenRouter) Summarize(s string) (string, error) {
return "", fmt.Errorf("failed to make request: %v", err)
}
req.Header.Set("Content-Type", "application/json")
- req.Header.Set("Authorization", "Bearer "+coh.apiKey)
+ req.Header.Set("Authorization", "Bearer "+or.apiKey)
- res, err := coh.client.Do(req)
+ res, err := or.client.Do(req)
if err != nil {
return "", err
}
defer res.Body.Close()
if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices {
- return "", fmt.Errorf("cohere api error: status %d", res.StatusCode)
+ return "", fmt.Errorf("openrouter api error: status %d", res.StatusCode)
}
var cr chatResponse
diff --git a/summary/summary.go b/summary/summary.go
index edf1da5d..90f4ac31 100644
--- a/summary/summary.go
+++ b/summary/summary.go
@@ -18,12 +18,6 @@ type Summarizer interface {
Summarize(s string) (string, error)
}
-// Summary provides a high-level interface for generating text summaries using
-// different summarization methods.
-type Summary struct {
- Handler interface{}
-}
-
// NewSummary creates and returns a Summarizer based on the configured LLM provider.
// It inspects opts.LLMProvider() (case-insensitive) and constructs a provider-specific
// handler. It falls back to the legacy summarizer implementation.
From a45498e6b38c6677b7ef15568206c2696cc1165b Mon Sep 17 00:00:00 2001
From: Wayback Archiver <66856220+waybackarchiver@users.noreply.github.com>
Date: Sun, 3 May 2026 14:55:55 +0000
Subject: [PATCH 8/8] Update prompt
---
summary/chat.go | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/summary/chat.go b/summary/chat.go
index eb477ac6..340be066 100644
--- a/summary/chat.go
+++ b/summary/chat.go
@@ -13,6 +13,12 @@ Rules:
- Summary must be in the same language as the source content
- Do NOT repeat ideas from previous snapshots unless conditions have materially changed
+FORMATTING RULES (STRICT):
+- STRICT PROHIBITION: Do not use Markdown bolding (**text**)
+- Use ONLY plain text without any formatting
+- Use simple line breaks to separate points
+- Do NOT use headers or bold labels
+
The output should be a maximum of 280 plain paragraphs.`
type chatMessage struct {