utils.py
"""Utility helpers: tweet preprocessing, a sigmoid, and a covariance confidence ellipse."""
import re
import string

import nltk
import numpy as np
from matplotlib import transforms
from matplotlib.patches import Ellipse
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

nltk.download('stopwords')
def trim(tweet, hyperlink=True, retweet=True, hash=True):
    '''
    arguments:
        tweet: the tweet to be processed
        hyperlink: boolean, True to remove hyperlinks
        retweet: boolean, True to remove a leading 'RT' marker
        hash: boolean, True to remove the '#' symbols
    returns:
        the updated tweet after trimming
    '''
    if hyperlink:
        tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    if retweet:
        tweet = re.sub(r'^RT[\s]', '', tweet)
    if hash:
        tweet = re.sub(r'#', '', tweet)
    return tweet
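
# A minimal usage sketch for trim(); the sample tweet is made up for
# illustration:
#     >>> trim("RT @user: loving #NLP https://t.co/abc")
#     '@user: loving NLP '
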
def tokenize(tweet):
    '''
    arguments:
        tweet: the tweet to be tokenized
    returns:
        tweet_tokens: the list of tokens in the tweet
    '''
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    return tweet_tokens
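
# Usage sketch (input is illustrative): TweetTokenizer lowercases, strips
# @handles, and shortens elongated character runs to length three:
#     >>> tokenize("Heyyyyy WORLD @user")
#     ['heyyy', 'world']
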
def removeStopwordsAndPunctuation(tweet_tokens):
    '''
    arguments:
        tweet_tokens: the tweet in tokenized form
    returns:
        clean_tokens: tweet_tokens without stopwords and punctuation
    '''
    stopwords_english = stopwords.words('english')
    clean_tokens = []
    for token in tweet_tokens:
        if token not in stopwords_english and token not in string.punctuation:
            clean_tokens.append(token)
    return clean_tokens
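
# Usage sketch: NLTK's English stopword list includes words such as 'i' and
# 'am', and string.punctuation covers single-character tokens such as '!':
#     >>> removeStopwordsAndPunctuation(['i', 'am', 'loving', 'nlp', '!'])
#     ['loving', 'nlp']
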
def stem(clean_tokens):
    '''
    arguments:
        clean_tokens: the tweet in tokenized form, without stopwords
    returns:
        stemmed_tokens: the tokens of clean_tokens reduced to their stems
    '''
    stemmer = PorterStemmer()
    stemmed_tokens = []
    for token in clean_tokens:
        stemmed_token = stemmer.stem(token)
        stemmed_tokens.append(stemmed_token)
    return stemmed_tokens
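
# Usage sketch: the Porter stemmer maps inflected forms to their stems (note
# that stems such as 'fli' need not be dictionary words):
#     >>> stem(['loving', 'flies', 'tired'])
#     ['love', 'fli', 'tire']
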
def sigmoid(z):
    '''
    arguments:
        z: a number or a numpy array
    returns:
        h: sigmoid(z)
    '''
    h = 1 / (1 + np.exp(-z))
    return h
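
# Usage sketch: sigmoid() works elementwise on scalars and numpy arrays:
#     >>> sigmoid(0)
#     0.5
#     >>> sigmoid(np.array([-np.inf, 0.0, np.inf]))
#     array([0. , 0.5, 1. ])
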
def confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', **kwargs):
    """
    Create a plot of the covariance confidence ellipse of *x* and *y*.

    Parameters
    ----------
    x, y : array-like, shape (n, )
        Input data.
    ax : matplotlib.axes.Axes
        The axes object to draw the ellipse into.
    n_std : float
        The number of standard deviations to determine the ellipse's radii.
    **kwargs
        Forwarded to `~matplotlib.patches.Ellipse`.

    Returns
    -------
    matplotlib.patches.Ellipse
    """
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])
    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2,
                      facecolor=facecolor, **kwargs)

    # Calculating the standard deviation of x from the square root of the
    # variance and multiplying by the given number of standard deviations.
    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)

    # Calculating the standard deviation of y likewise.
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)
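
# A hedged end-to-end sketch; the sample tweet and the random plot data are
# made up for illustration, and matplotlib.pyplot is imported only for this
# demo:
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    # Full preprocessing pipeline: trim -> tokenize -> remove stopwords and
    # punctuation -> stem. Expected output: ['love', 'nlp'].
    raw = "RT @user: loving #NLP https://t.co/abc"
    print(stem(removeStopwordsAndPunctuation(tokenize(trim(raw)))))

    # Draw a 2-sigma confidence ellipse over correlated random data.
    rng = np.random.default_rng(0)
    x = rng.normal(size=500)
    y = 0.8 * x + rng.normal(scale=0.5, size=500)
    fig, ax = plt.subplots()
    ax.scatter(x, y, s=3)
    confidence_ellipse(x, y, ax, n_std=2.0, edgecolor='red')
    plt.show()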