watermarkDetect/classify.py at master · AidHamza/watermarkDetect · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# Copyright (C) 2013 Wesley Baugh
"""Tools for text classification.
Extracted from the [infer](https://github.com/bwbaugh/infer) library.
"""
from __future__ import division

import math
from collections import defaultdict, namedtuple, Counter
from fractions import Fraction


class MultinomialNB(object):
    """Multinomial Naive Bayes for text classification.
    Attributes:
        exact: Boolean indicating if exact probabilities should be
            returned as a `Fraction`. Otherwise, speed up computations
            but only return probabilities as a `float`. (default False)
        laplace: Smoothing parameter >= 0. (default 1)
        top_features: Number indicating the top-k most common features
            to use during classification, sorted by the frequency the
            feature has been seen (a count is kept for each label). This
            is a form of feature selection because any feature that has
            a frequency less than any of the top-k most common features
            is ignored during classification. This value must be set
            before any training of the classifier. (default None)
    Properties:
        labels: Set of all class labels.
        vocabulary: Set of vocabulary across all class labels.
    """
    Prediction = namedtuple('Prediction', 'label confidence')

    def __init__(self, *documents):
        """Create a new Multinomial Naive Bayes classifier.
        Args:
            documents: Optional list of document-label pairs for training.
        """
        self.exact = False
        self.laplace = 1
        self.top_features = None
        # Dictionary of sets of vocabulary by label.
        self._label_vocab = defaultdict(set)
        # Dictionary of times a label has been seen.
        self._label_count = Counter()
        # Dictionary of number of feature seen in all documents by label.
        self._label_length = Counter()
        # Dictionary of times a feature has been seen by label.
        self._label_feature_count = defaultdict(Counter)
        # Size of vocabulary across all class labels.
        self._vocab_size = 0
        if documents:
            self.train(*documents)

    @property
    def labels(self):
        """Set of all class labels.
        Returns:
            Example: set(['positive', 'negative'])
        """
        return set(label for label in self._label_count)

    @property
    def vocabulary(self):
        """Set of vocabulary (features) seen in any class label."""
        label_vocab = [self._label_vocab[x] for x in self._label_vocab]
        return set().union(*label_vocab)

    def train(self, *documents):
        """Train the classifier on a document-label pair(s).
        Args:
            documents: Tuple of (document, label) pair(s). Documents
                must be a collection of features. The label can be any
                hashable object, though is usually a string.
        """
        for document, label in documents:
            # Python 3: isinstance(document, str)
            if isinstance(document, basestring):
                raise TypeError('Documents must be a collection of features')
            self._label_count[label] += 1
            for feature in document:
                # Check if the feature hasn't been seen before for any label.
                if not any(feature in self._label_vocab[x] for x in self.labels):
                    self._vocab_size += 1
                self._label_vocab[label].add(feature)
                self._label_feature_count[label][feature] += 1
                self._label_length[label] += 1
                if self.top_features:
                    if not hasattr(self, '_most_common'):
                        x = lambda: MostCommon(self.top_features)
                        self._most_common = defaultdict(x)
                    y = self._label_feature_count[label][feature]
                    self._most_common[label][feature] = y

    def prior(self, label):
        """Prior probability of a label.
        Args:
            label: The target class label.
            self.exact
        Returns:
            The number of training instances that had the target
            `label`, divided by the total number of training instances.
        """
        if label not in self.labels:
            raise KeyError(label)
        total = sum(self._label_count.values())
        if self.exact:
            return Fraction(self._label_count[label], total)
        else:
            return self._label_count[label] / total

    def conditional(self, feature, label):
        """Conditional probability for a feature given a label.
        Args:
            feature: The target feature.
            label: The target class label.
            self.laplace
            self.exact
        Returns:
            The number of times the feature has been present across all
            training documents for the `label`, divided by the sum of
            the length of every training document for the `label`.
        """
        # Note we use [Laplace smoothing][laplace].
        # [laplace]: https://en.wikipedia.org/wiki/Additive_smoothing
        if label not in self.labels:
            raise KeyError(label)

        # Times feature seen across all documents in a label.
        numer = self.laplace
        # Avoid creating an entry if the term has never been seen
        if feature in self._label_feature_count[label]:
            numer += self._label_feature_count[label][feature]
        denom = self._label_length[label] + (self._vocab_size * self.laplace)
        if self.exact:
            return Fraction(numer, denom)
        else:
            return numer / denom

    def _score(self, document, label):
        """Multinomial raw score of a document given a label.
        Args:
            document: Collection of features.
            label: The target class label.
            self.exact
        Returns:
            The multinomial raw score of the `document` given the
            `label`. In order to turn the raw score into a confidence
            value, this value should be divided by the sum of the raw
            scores across all class labels.
        """
        if isinstance(document, basestring):
            raise TypeError('Documents must be a list of features')

        if self.exact:
            score = self.prior(label)
        else:
            score = math.log(self.prior(label))

        for feature in document:
            # Feature selection by only considering the top-k
            # most common features (a form of dictionary trimming).
            if self.top_features and feature not in self._most_common[label]:
                continue
            conditional = self.conditional(feature, label)
            if self.exact:
                score *= conditional
            else:
                score += math.log(conditional)

        return score

    def _compute_scores(self, document):
        """Compute the multinomial score of a document for all labels.
        Args:
            document: Collection of features.
        Returns:
            A dict mapping class labels to the multinomial raw score
            for the `document` given the label.
        """
        return {x: self._score(document, x) for x in self.labels}

    def prob_all(self, document):
        """Probability of a document for all labels.
        Args:
            document: Collection of features.
            self.exact
        Returns:
            A dict mapping class labels to the confidence value that the
            `document` belongs to the label.
        """
        score = self._compute_scores(document)
        if not self.exact:
            # If the log-likelihood is too small, when we convert back
            # using `math.exp`, the result will round to zero.
            normalize = max(score.itervalues())
            assert normalize <= 0, normalize
            score = {x: math.exp(score[x] - normalize) for x in score}
        total = sum(score[x] for x in score)
        assert total > 0, (total, score, normalize)
        if self.exact:
            return {label: Fraction(score[label], total) for label in
                    self.labels}
        else:
            return {label: score[label] / total for label in self.labels}

    def prob(self, document, label):
        """Probability of a document given a label.
        Args:
            document: Collection of features.
            label: The target class label.
        Returns:
            The confidence value that the `document` belongs to `label`.
        """
        prob = self.prob_all(document)[label]
        return prob

    def classify(self, document):
        """Get the most confident class label for a document.
        Args:
            document: Collection of features.
        Returns:
            A namedtuple representing the most confident class `label`
            and the value of the `confidence` in the label. For example:
            As tuple:
                ('positive', 0.85)
            As namedtuple:
                Prediction(label='positive', confidence=0.85)
        """
        prob = self.prob_all(document)
        label = max(prob, key=prob.get)
        return self.Prediction(label, prob[label])