-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgetting-data.py
More file actions
137 lines (103 loc) · 3.78 KB
/
getting-data.py
File metadata and controls
137 lines (103 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import requests
import time
import sys
import json
import datetime
import os
# get access token
# The GitHub token is read from the first line of a file named 'access' in the
# current working directory; without it the script cannot authenticate.
if not os.path.isfile('access'):
    print("please place a Github access token in this directory.")
    # Exit with a non-zero status so a calling shell/script sees the failure.
    sys.exit(1)
with open('access', 'r') as token_file:
    # strip() drops the trailing newline as well as a stray "\r" or spaces,
    # which would otherwise become part of the Authorization header value.
    access = token_file.readline().strip()
if not access:
    # An empty first line would produce a header without a token.
    print("the file 'access' does not contain a Github access token.")
    sys.exit(1)
# Read the mined commits that this script will walk through.
with open('all_commits.json', 'r') as commits_file:
    repositories = json.load(commits_file)

# Resume from an earlier run if a filter file is present, otherwise start empty.
datafilter = {}
if os.path.isfile('DataFilter.json'):
    with open('DataFilter.json', 'r') as filter_file:
        datafilter = json.load(filter_file)

# Make sure each category exists, then report how many entries it holds.
category_summaries = (
    ('showcase', " repositories are showcases and therefore ignored."),
    ('no-python', " repositories don't even contain ANY python."),
    ('python', " might contain python."),
)
for category, suffix in category_summaries:
    entries = datafilter.setdefault(category, {})
    print(str(len(entries)) + suffix)
# State shared by the whole collection run.
data = {}  # repo URL -> {commit id -> commit info, extended with its "diff" text}
myheaders = {'Authorization': 'token ' + access}
progress = 0  # number of repositories visited so far
total = 0  # number of collected commits (diff mentions ".py")
newrepos = 0  # never read in this block
nopythonlist = {}  # used to mark which commits have and don't have python files modified
# True while everything collected so far is already on disk. Initialised here
# so the checkpoint test below never depends on evaluation order.
saved = True
# Upper bound (seconds) for a single diff download, so one stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30

# starting to collect requested data
for progress, repo in enumerate(repositories, start=1):
    # Checkpoint: on every 3000th repository, write both result files if
    # something new was collected since the last write.
    if progress % 3000 == 0 and total > 0 and not saved:
        print("Time to save.")
        saved = True
        with open('DataFilter.json', 'w') as outfile:
            json.dump(datafilter, outfile)
        with open('PyCommitsWithDiffs.json', 'w') as outfile:
            json.dump(data, outfile)

    # The part after the host is the key used in the filter categories.
    parts = repo.split('https://github.com/')
    if len(parts) < 2:
        # Not a github.com URL: there is no name to look up or record.
        print("skip: not a github.com URL")
        continue
    name = parts[1]

    if name in datafilter['showcase']:
        print("skip: showcase")
        continue
    if name in datafilter['no-python']:
        print("skip: no python")
        continue

    print("\n" + repo + " " + str(progress))
    commits_without_python = nopythonlist.setdefault(repo, {})
    found_python = False  # at least one diff of this repository mentions ".py"
    fetch_failed = False  # at least one diff could not be downloaded

    # go through all commits of that repository
    for commit in repositories[repo]:
        # if we already know that the commit has no python, skip it
        if commit in commits_without_python:
            continue

        # otherwise, get the DIFF file
        target = repo + '/commit/' + commit + '.diff'
        try:
            response = requests.get(target, headers=myheaders,
                                    timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # A network problem must not abort the run and lose collected data.
            print("request failed (" + str(err) + "). Skip.")
            fetch_failed = True
            continue
        if not response.ok:
            # An error page is not a diff; classifying it would wrongly record
            # the commit (and possibly the repository) as containing no python.
            print("unexpected HTTP status " + str(response.status_code) + ". Skip.")
            fetch_failed = True
            continue

        # errors='ignore' drops undecodable bytes, so decoding cannot fail here.
        diffcontent = response.content.decode('utf-8', errors='ignore')

        # check if the file contains any python
        # NOTE(review): this is a plain substring test, so any occurrence of
        # ".py" in the diff text counts, not only changed *.py files.
        if ".py" in diffcontent:
            found_python = True
            # we should save again when the time is right
            total += 1
            saved = False
            # copy the relevant information to 'data' and attach the diff
            data.setdefault(repo, {})[commit] = repositories[repo][commit]
            data[repo][commit]["diff"] = diffcontent
        else:
            # note down that this commit doesn't contain any python files
            commits_without_python[commit] = {}

    if found_python:
        # repository has some python, and we checked it now
        datafilter['python'][name] = {}
    elif not fetch_failed:
        # Every diff was downloaded and none mentioned python. When a download
        # failed the repository stays unclassified so a later run retries it.
        datafilter['no-python'][name] = {}
# Report how many commits were collected in this run.
print(str(total) + " commits modifying python were found.")

# Write both result files: first the filter markings (which repositories
# contain no python), then the collected commits with their diffs.
for path, payload in (
    ('DataFilter.json', datafilter),
    ('PyCommitsWithDiffs.json', data),
):
    with open(path, 'w') as sink:
        json.dump(payload, sink)