-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtutorial_2_matrices.py
More file actions
186 lines (156 loc) · 5.59 KB
/
tutorial_2_matrices.py
File metadata and controls
186 lines (156 loc) · 5.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#' # MAlign Tutorial 2: Scoring Matrices
#'
#' This tutorial explains how to create, use, and customize scoring matrices
#' for alignment operations.
#'
#' ## What are Scoring Matrices?
#'
#' Scoring matrices define the cost/benefit of aligning symbols:
#' - **Match**: High positive score (symbols align well)
#' - **Mismatch**: Lower or negative score (symbols don't match)
#' - **Gap**: Penalty for inserting gaps
#' - **Asymmetric**: Scores are directional, so A->B can differ from B->A
#'
#' ## Creating Matrices
import malign
#' ### Default (Identity Matrix)
#'
#' When `malign.align()` is called without a `matrix` argument, MAlign
#' creates an identity matrix automatically:
sequences = ["ACGT", "AGCT"]
# No `matrix` keyword here, so the default identity matrix is used.
# NOTE(review): k=1 presumably requests a single alignment -- confirm
# against the malign.align() documentation.
alms = malign.align(sequences, k=1)
print("Default matrix alignment:")
default_table = malign.tabulate_alms(alms)
print(default_table)
#' ### From Sequences
#'
#' `ScoringMatrix.from_sequences()` builds a matrix with simple
#' match/mismatch scoring from the symbol lists it is given:
dna_symbols = "ACGT"
matrix = malign.ScoringMatrix.from_sequences(
    # list("ACGT") == ["A", "C", "G", "T"]; two separate list objects are passed.
    sequences=[list(dna_symbols), list(dna_symbols)],
    match=2.0,
    mismatch=-1.0,
    gap_score=-1.5,
)
# Align the `sequences` defined earlier in the script, now with the custom matrix.
alms = malign.align(sequences, k=1, matrix=matrix)
print("\nCustom match/mismatch matrix:")
custom_table = malign.tabulate_alms(alms)
print(custom_table)
#' ### Cross-Domain Matrices
#'
#' Sequences drawn from different alphabets (e.g., Latin and Cyrillic) give
#' the matrix two domains, each with its own symbol set:
latin_symbols = ["A", "C", "G", "T"]
# Escapes for the Cyrillic capitals U+0410, U+0412, U+0413 and U+0422.
cyrillic_symbols = ["\u0410", "\u0412", "\u0413", "\u0422"]
matrix = malign.ScoringMatrix.from_sequences(
    sequences=[latin_symbols, cyrillic_symbols],
    match=1.0,
    mismatch=-0.5,
    gap_score=-1.0,
)
# Report how many domains the matrix holds, then the symbols of each one.
domain_count = len(matrix.domains)
print(f"\nCross-domain matrix: {domain_count} domains")
first_domain = list(matrix.domains[0])
print(f"Domain 0 symbols: {first_domain}")
second_domain = list(matrix.domains[1])
print(f"Domain 1 symbols: {second_domain}")
#' ### From Phonological Features (distfeat)
#'
#' The `from_distfeat()` factory builds matrices from phonological feature
#' distances. Sounds that share more features get higher alignment scores.
#' Requires `pip install malign[features]`.
try:
    # Only the factory call is guarded, so an ImportError raised anywhere else
    # is not silently reported as "optional extra missing".
    # NOTE(review): assumes the optional dependency is imported inside
    # from_distfeat() itself -- confirm against the malign source.
    matrix = malign.ScoringMatrix.from_distfeat(
        sequences=[["p", "t", "k", "b", "d", "g"], ["p", "t", "k", "b", "d", "g"]],
        gap="-",
        gap_score=-1.0,
    )
except ImportError:
    # Optional extra not installed: skip this section instead of aborting.
    print("\n(Skipping distfeat example -- install with: pip install malign[features])")
else:
    # p-b differ only in voicing: high similarity
    print("\nFeature-based scores:")
    print(f" p-b (voicing only): {matrix[('p', 'b')]:.3f}")
    print(f" p-d (voicing+place): {matrix[('p', 'd')]:.3f}")
    print(f" p-g (voicing+place): {matrix[('p', 'g')]:.3f}")
    print(f" p-p (identical): {matrix[('p', 'p')]:.3f}")
    # Note: from_distfeat() is symmetric
    print(f" Symmetric: p-b == b-p? {matrix[('p', 'b')] == matrix[('b', 'p')]}")
#' ### From Substitution Counts
#'
#' Observed substitution frequencies (e.g., from a corpus of cognate pairs)
#' can be passed to `from_substitution_counts()` to build an asymmetric
#' log-odds matrix:
# Each row is (source, target, times observed); the row order is kept as the
# key order of `counts`.
observed = [
    ("p", "b", 15),  # p -> b: voicing, common
    ("b", "p", 3),  # b -> p: devoicing, rare
    ("p", "p", 20),  # p -> p: identity
    ("b", "b", 18),  # b -> b: identity
    ("t", "d", 10),  # t -> d: voicing
    ("d", "t", 2),  # d -> t: devoicing, rare
    ("t", "t", 25),
    ("d", "d", 20),
]
counts = {(source, target): freq for source, target, freq in observed}
matrix = malign.ScoringMatrix.from_substitution_counts(counts)
print("\nAsymmetric log-odds scores from counts:")
# Look up each direction once and reuse it for the asymmetry check.
voicing_score = matrix[("p", "b")]
print(f" p->b (common): {voicing_score:.3f}")
devoicing_score = matrix[("b", "p")]
print(f" b->p (rare): {devoicing_score:.3f}")
print(f" Asymmetric: {voicing_score != devoicing_score}")
#' ### From YAML Files
#'
#' A matrix can be written to a YAML file and read back later for reuse:
# Build a small matrix to save.
acg_symbols = ["A", "C", "G"]
matrix = malign.ScoringMatrix.from_sequences(
    sequences=[list(acg_symbols), list(acg_symbols)],
    match=1.0,
    mismatch=-0.5,
    gap_score=-1.0,
)
# The save/load calls below are left commented out, so running this
# tutorial does not write any file.
# matrix.save("my_matrix.yml")
# Load it back
# loaded = malign.ScoringMatrix.from_yaml("my_matrix.yml")
#' ### Manual Construction
#'
#' For complete control, pass an explicit score dict to `ScoringMatrix`:
# Scores are grouped by first symbol; merging the rows in this order gives
# the same key order as a single flat literal.
scores_from_a = {
    ("A", "A"): 2.0,
    ("A", "C"): -1.0,
    ("A", "-"): -2.0,
}
scores_from_c = {
    ("C", "A"): -0.5,  # Asymmetric: C->A differs from A->C
    ("C", "C"): 2.0,
    ("C", "-"): -2.0,
}
scores_from_gap = {
    ("-", "A"): -2.0,
    ("-", "C"): -2.0,
    ("-", "-"): 0.0,
}
scores = {**scores_from_a, **scores_from_c, **scores_from_gap}
matrix = malign.ScoringMatrix(
    scores=scores,
    # Two separate but equal domain lists.
    domains=[["-", "A", "C"] for _ in range(2)],
    gap="-",
    impute_method=None,
)
entry_count = len(matrix.scores)
print(f"\nManual matrix: {entry_count} score entries")
a_to_c = matrix[("A", "C")]
print(f" A->C = {a_to_c:.1f}")
c_to_a = matrix[("C", "A")]
print(f" C->A = {c_to_a:.1f} (asymmetric!)")
#' ## Matrix Imputation
#'
#' A sparse matrix (one with missing symbol pairs) can have its gaps filled
#' in automatically:
# Only the two identity pairs are given; every other pair is left out.
known_scores = {("A", "A"): 1.0, ("C", "C"): 1.0}
sparse_settings = {
    "scores": known_scores,
    "domains": [["-", "A", "C", "G"], ["-", "A", "C", "G"]],
    "gap": "-",
    "impute_method": "mean",  # also "median" or "zero"
}
matrix = malign.ScoringMatrix(**sparse_settings)
# Missing scores are imputed on access
print("\nImputed matrix:")
print(f" A-A (known): {matrix[('A', 'A')]:.3f}")
print(f" A-C (imputed): {matrix[('A', 'C')]:.3f}")
#' ## Inspecting Matrices
#'
#' `tabulate()` returns a printable view of the full matrix:
table_symbols = "ACG"
matrix = malign.ScoringMatrix.from_sequences(
    # list("ACG") == ["A", "C", "G"]; two separate list objects are passed.
    sequences=[list(table_symbols), list(table_symbols)],
    match=1.0,
    mismatch=-0.5,
    gap_score=-1.0,
)
print("\nMatrix table:")
matrix_table = matrix.tabulate()
print(matrix_table)
#' ## Summary
#'
#' | Factory Method | Symmetric? | Input | Use Case |
#' |---|---|---|---|
#' | `from_sequences()` | Yes | Symbol lists | Quick testing |
#' | `from_distfeat()` | Yes | IPA segments | Phonological knowledge |
#' | `from_substitution_counts()` | No | Frequency counts | Observed sound changes |
#' | `from_yaml()` | Either | YAML file | Reuse saved matrices |
#' | `ScoringMatrix()` | Either | Score dict | Full manual control |