From 274784be3ba96ddfc9920d07388e622a977fb341 Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Sun, 29 Mar 2026 21:58:56 -0400 Subject: [PATCH] v1.0.0: bug fixes, cleanup, and modernization Bug fixes: - Fix findall using original iterator instead of collected copy for partitioning (would fail with generators) - Fix findnearest adaptive threshold: compare now reads the atomic min_score so the threshold progressively tightens during search - Remove redundant fetch.() calls in findnearest and findall Breaking changes: - Remove deprecated OptimalStringAlignement binding (use OptimalStringAlignment) - Remove deprecated findmax (use findnearest) Cleanup: - Fix misleading [1] indexing on common_prefix() in JaroWinkler (returns Int, not tuple) - Fix NMD duplicate docstring line - Add comment explaining the max_dist fallback pattern - Fix README: Hamming listed as SemiMetric, actually Metric Modernization: - Bump Julia compat from 1.3 to 1.6 - Update CI to test on Julia 1.6 instead of 1.3 Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/ci.yml | 2 +- Project.toml | 4 ++-- README.md | 2 +- src/StringDistances.jl | 2 ++ src/distances/edit.jl | 3 +-- src/distances/qgram.jl | 1 - src/find.jl | 12 ++++-------- 7 files changed, 11 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5149143..b643c00 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: fail-fast: false matrix: version: - - '1.3' + - '1.6' - '1' # automatically expands to the latest stable 1.x release of Julia os: - ubuntu-latest diff --git a/Project.toml b/Project.toml index 00ae5cc..42ff651 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "StringDistances" uuid = "88034a9c-02f8-509d-84a9-84ec65e18404" -version = "0.11.4" +version = "1.0.0" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" @@ -9,7 +9,7 @@ StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" [compat] Distances = 
"0.8.1, 0.9, 0.10" StatsAPI = "1" -julia = "1.3" +julia = "1.6" [extras] Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/README.md b/README.md index f08e416..94105e4 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ String distances act over any pair of iterators that define `length` (e.g. `Abst The available distances are: - Edit Distances - - Hamming Distance `Hamming() <: SemiMetric` + - Hamming Distance `Hamming() <: Metric` - [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler() <: SemiMetric` - [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric` - [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignment() <: SemiMetric` diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 40ad1af..4ade284 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -16,6 +16,8 @@ function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1:: end +# Fallback: swallow `max_dist` keyword for distances that don't support it, +# so callers like Partial/TokenMax can pass max_dist generically. 
(dist::Union{StringSemiMetric, StringMetric})(s1, s2; max_dist = nothing) = dist(s1, s2) include("utils.jl") include("distances/edit.jl") diff --git a/src/distances/edit.jl b/src/distances/edit.jl index 6802494..1bc9f9f 100755 --- a/src/distances/edit.jl +++ b/src/distances/edit.jl @@ -98,7 +98,7 @@ function (dist::JaroWinkler)(s1, s2) (s1 === missing) | (s2 === missing) && return missing out = Jaro()(s1, s2) if out <= dist.threshold - l = common_prefix(s1, s2)[1] + l = common_prefix(s1, s2) out = (1 - min(l, dist.maxlength) * dist.p) * out end return out @@ -246,7 +246,6 @@ function (dist::OptimalStringAlignment)(s1, s2; max_dist::Union{Integer, Nothing return Int(current) end -Base.@deprecate_binding OptimalStringAlignement OptimalStringAlignment """ DamerauLevenshtein() diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 01838ac..936be46 100755 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -104,7 +104,6 @@ eval_end(::Overlap, c::NTuple{3, <:Integer}) = 1 - c[3] / min(c[1], c[2]) """ NMD(q::Int) - NMD(q::Int) Creates a NMD (Normalized Multiset Distance) as introduced by Besiris and Zigouris 2013. 
The goal with this distance is to behave similarly to a normalized diff --git a/src/find.jl b/src/find.jl index 2f8d21c..51b1b8f 100644 --- a/src/find.jl +++ b/src/find.jl @@ -47,7 +47,7 @@ function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_sc chunk_score_tasks = map(data_chunks) do chunk Threads.@spawn begin map(chunk) do x - score = compare(_preprocessed_s, _preprocess(dist, x), dist; min_score = min_score) + score = compare(_preprocessed_s, _preprocess(dist, x), dist; min_score = min_score_atomic[]) Threads.atomic_max!(min_score_atomic, score) score end @@ -57,7 +57,7 @@ function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_sc # retrieve return type of `compare` for type stability in task _self_cmp = compare(_preprocessed_s, _preprocessed_s, dist; min_score = min_score) chunk_scores = fetch.(chunk_score_tasks)::Vector{Vector{typeof(_self_cmp)}} - scores = reduce(vcat, fetch.(chunk_scores)) + scores = reduce(vcat, chunk_scores) imax = argmax(scores) iszero(scores) ? (nothing, nothing) : (_citr[imax], imax) @@ -67,10 +67,6 @@ _preprocess(dist::AbstractQGramDistance, ::Missing) = missing _preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q) _preprocess(dist::Union{StringSemiMetric, StringMetric}, s) = s -function Base.findmax(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0) - @warn "findmax(s, itr, dist; min_score) is deprecated. 
Use findnearest(s, itr, dist; min_score)" - findnearest(s, itr, dist; min_score = min_score) -end """ findall(s, itr , dist::StringDistance; min_score = 0.8) @@ -99,7 +95,7 @@ function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_s _preprocessed_s = _preprocess(dist, s) chunk_size = max(1, length(_citr) ÷ (2 * Threads.nthreads())) - data_chunks = Iterators.partition(itr, chunk_size) + data_chunks = Iterators.partition(_citr, chunk_size) isempty(data_chunks) && return empty(eachindex(_citr)) chunk_score_tasks = map(data_chunks) do chunk @@ -114,6 +110,6 @@ function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_s _self_cmp = compare(_preprocessed_s, _preprocessed_s, dist; min_score = min_score) chunk_scores::Vector{Vector{typeof(_self_cmp)}} = fetch.(chunk_score_tasks) - scores = reduce(vcat, fetch.(chunk_scores)) + scores = reduce(vcat, chunk_scores) return findall(>=(min_score), scores) end