Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
fail-fast: false
matrix:
version:
- '1.3'
- '1.6'
- '1' # automatically expands to the latest stable 1.x release of Julia
os:
- ubuntu-latest
Expand Down
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.11.4"
version = "1.0.0"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Expand All @@ -9,7 +9,7 @@ StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
[compat]
Distances = "0.8.1, 0.9, 0.10"
StatsAPI = "1"
julia = "1.3"
julia = "1.6"

[extras]
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ String distances act over any pair of iterators that define `length` (e.g. `Abst

The available distances are:
- Edit Distances
- Hamming Distance `Hamming() <: SemiMetric`
- Hamming Distance `Hamming() <: Metric`
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler() <: SemiMetric`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric`
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignment() <: SemiMetric`
Expand Down
2 changes: 2 additions & 0 deletions src/StringDistances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::
end


# Generic fallback: accept and discard the `max_dist` keyword, then forward to the
# plain two-argument call. This lets wrappers such as Partial/TokenMax pass
# `max_dist` uniformly, even to distances that have no use for it.
# NOTE(review): this relies on a more specific `dist(s1, s2)` method existing for the
# concrete distance type; otherwise the forwarded call would dispatch back here — confirm.
function (dist::Union{StringSemiMetric, StringMetric})(s1, s2; max_dist = nothing)
    return dist(s1, s2)
end
include("utils.jl")
include("distances/edit.jl")
Expand Down
3 changes: 1 addition & 2 deletions src/distances/edit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ function (dist::JaroWinkler)(s1, s2)
(s1 === missing) | (s2 === missing) && return missing
out = Jaro()(s1, s2)
if out <= dist.threshold
l = common_prefix(s1, s2)[1]
l = common_prefix(s1, s2)
out = (1 - min(l, dist.maxlength) * dist.p) * out
end
return out
Expand Down Expand Up @@ -246,7 +246,6 @@ function (dist::OptimalStringAlignment)(s1, s2; max_dist::Union{Integer, Nothing
return Int(current)
end

Base.@deprecate_binding OptimalStringAlignement OptimalStringAlignment

"""
DamerauLevenshtein()
Expand Down
1 change: 0 additions & 1 deletion src/distances/qgram.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ eval_end(::Overlap, c::NTuple{3, <:Integer}) = 1 - c[3] / min(c[1], c[2])

"""
NMD(q::Int)
NMD(q::Int)

Creates an NMD (Normalized Multiset Distance) as introduced by Besiris and
Zigouris 2013. The goal with this distance is to behave similarly to a normalized
Expand Down
12 changes: 4 additions & 8 deletions src/find.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_sc
chunk_score_tasks = map(data_chunks) do chunk
Threads.@spawn begin
map(chunk) do x
score = compare(_preprocessed_s, _preprocess(dist, x), dist; min_score = min_score)
score = compare(_preprocessed_s, _preprocess(dist, x), dist; min_score = min_score_atomic[])
Threads.atomic_max!(min_score_atomic, score)
score
end
Expand All @@ -57,7 +57,7 @@ function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_sc
# retrieve return type of `compare` for type stability in task
_self_cmp = compare(_preprocessed_s, _preprocessed_s, dist; min_score = min_score)
chunk_scores = fetch.(chunk_score_tasks)::Vector{Vector{typeof(_self_cmp)}}
scores = reduce(vcat, fetch.(chunk_scores))
scores = reduce(vcat, chunk_scores)

imax = argmax(scores)
iszero(scores) ? (nothing, nothing) : (_citr[imax], imax)
Expand All @@ -67,10 +67,6 @@ _preprocess(dist::AbstractQGramDistance, ::Missing) = missing
# Q-gram distances: wrap `s` in a `QGramSortedVector` built with the distance's `q`
# (presumably so the q-grams are computed once per string — confirm in qgram.jl).
function _preprocess(dist::AbstractQGramDistance, s)
    return QGramSortedVector(s, dist.q)
end

# Fallback for the remaining string distances: the input is returned unchanged.
function _preprocess(dist::Union{StringSemiMetric, StringMetric}, s)
    return s
end

# Deprecated entry point: logs a deprecation warning on every call, then delegates
# to `findnearest` with the same arguments and returns its result.
function Base.findmax(
    s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0
)
    @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
    nearest = findnearest(s, itr, dist; min_score = min_score)
    return nearest
end

"""
findall(s, itr, dist::StringDistance; min_score = 0.8)
Expand Down Expand Up @@ -99,7 +95,7 @@ function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_s
_preprocessed_s = _preprocess(dist, s)

chunk_size = max(1, length(_citr) ÷ (2 * Threads.nthreads()))
data_chunks = Iterators.partition(itr, chunk_size)
data_chunks = Iterators.partition(_citr, chunk_size)
isempty(data_chunks) && return empty(eachindex(_citr))

chunk_score_tasks = map(data_chunks) do chunk
Expand All @@ -114,6 +110,6 @@ function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_s
_self_cmp = compare(_preprocessed_s, _preprocessed_s, dist; min_score = min_score)
chunk_scores::Vector{Vector{typeof(_self_cmp)}} = fetch.(chunk_score_tasks)

scores = reduce(vcat, fetch.(chunk_scores))
scores = reduce(vcat, chunk_scores)
return findall(>=(min_score), scores)
end
Loading