-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrossResourcePyScript.py
More file actions
209 lines (171 loc) · 7.37 KB
/
crossResourcePyScript.py
File metadata and controls
209 lines (171 loc) · 7.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import requests
import json
import csv
import os
import time
import sys
def main(language, packages_filename, ghtoken=""):
    """
    Entry point for the scraper: load the package list, query Github for each
    package, and write the matches out to a CSV file.

    :param str language: python or r
    :param str packages_filename: Filename of csv that contains packages list
    :param str ghtoken: Github token
    :return none:
    """
    # Fall back to the 'gh.token' file when no token was passed in.
    ghtoken = ghtoken or read_gh_token()
    # Package list lives under data/input alongside the token file.
    csv_path = os.path.join("data", "input", packages_filename)
    packages = read_pkg_csv(csv_path)
    # Query Github for every package, then persist the collected matches.
    write_csv(collect_data(packages, ghtoken, language))
    return
def read_gh_token():
    """
    Read the Github token from the 'data/input/gh.token' file.

    :return str ghtoken: Github token (first line of the file, whitespace-stripped)
    :raises SystemExit: if the token file does not exist
    """
    try:
        with open(os.path.join("data", "input", "gh.token"), "r") as f:
            # Only the first line holds the token. Strip the trailing newline:
            # the raw readlines()[0] value would corrupt the Authorization
            # header sent to the Github API. readline() also tolerates an
            # empty file (returns ""), where indexing readlines() would crash.
            ghtoken = f.readline().strip()
    except FileNotFoundError:
        print("No Github token provided and no 'gh.token' file was found. Please provide one and try again.")
        sys.exit(0)
    # NOTE: the token is a secret — do not print it to stdout.
    return ghtoken
def read_pkg_csv(filename):
    """
    Open the CSV file and read in the package list.

    :param str filename: Csv file containing package list (one package per row,
        first column; an optional header row labelled "package" is skipped)
    :return list packlist: Package list
    :raises FileNotFoundError: if the csv file does not exist
    """
    packlist = []
    # newline="" is required by the csv module for correct handling of
    # quoted newlines and platform line endings.
    with open(filename, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        for row in reader:
            # Skip blank rows (row[0] on an empty row would raise IndexError)
            # and the header row.
            if row and row[0] != "package":
                packlist.append(row[0])
    return packlist
def write_csv(results):
    """
    Write out the results to 'scrapeGitResults.csv' (created or overwritten).

    :param list results: Query results from Github; each item is a dict that
        may contain the keys in _headers. Missing keys are written as " ".
    :return none:
    """
    # The column order for the output file.
    _headers = ["package", "crossover_package", "crossover_file"]
    # Organize results into a csv-friendly writeable list; dict.get supplies
    # the " " placeholder for any missing column.
    _writeout = [[line.get(header, " ") for header in _headers] for line in results]
    # newline="" prevents the csv module from emitting blank lines between
    # rows on Windows.
    with open("scrapeGitResults.csv", "w", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(_headers)
        writer.writerows(_writeout)
    return
def collect_data(packlist, ghtoken, language):
    """
    Make a github query for each package in the package list.

    :param list packlist: Package list
    :param str ghtoken: Github token
    :param str language: python or r
    :return list results: Results from Github queries
    """
    results = []
    lang = language.lower()
    # Loop for each package
    for pkg in packlist:
        print("Querying Github: {}".format(pkg))
        # Reset per package so a stale URL from a previous iteration can
        # never leak into an unrecognized-language case.
        next_url = ""
        # Format the URL to GET. Query format depends on the language.
        if lang == "r":
            next_url = 'https://api.github.com/search/code?q="library({})"+in:file+language:"r"+extension:"r"' \
                       '+extension:"rmd"'.format(pkg)
        elif lang == "python" or lang == "py":
            next_url = 'https://api.github.com/search/code?q="import {}"+in:file+language:"python"+' \
                       'extension:"py"'.format(pkg)
            # TODO Do we also try to query this string for packages listed in multi-import statements too? This is not
            # a PEP8 standard, but some people still do multi-imports like this anyways. Rare
            # next_url = 'https://api.github.com/search/code?q=", {}"+in:file+language:"python"+' \
            # 'extension:"py"'.format(pkg)
        # In case there are multiple pages of results, keep requesting the
        # "next" page until all results are collected. send_query returns an
        # empty next_url when there are no more pages, ending the loop — the
        # old explicit break inside the loop was dead code.
        while next_url:
            results, next_url = send_query(pkg, next_url, ghtoken, results)
    # All done. Return all results.
    return results
def send_query(pkg, req_url, ghtoken, results):
    """
    Send one request to the Github API. Sort the results into a list of objects, and return them.

    :param str pkg: Current package being queried
    :param str req_url: URL for the GET request
    :param str ghtoken: Github Token
    :param list results: Results (so far)
    :return tuple (results, next_url): Results (with new additions) and the URL
        of the next results page ("" when there are no more pages)
    """
    # Placeholder for the next page url
    next_url = ""
    try:
        # Make the github request. Look for 'import <package_name>' in code in python files
        r = requests.get(
            req_url,
            headers={"Authorization": "token {}".format(ghtoken), "Accept": "application/vnd.github.v3+json"})
        # Did the query come back successful?
        if r.status_code == 200:
            # BUGFIX: the old code did `if r.links["next"]`, which raises
            # KeyError on the LAST page of results (no "next" link). The
            # broad except then skipped the parsing loop, silently dropping
            # every final page's items. dict.get keeps pagination optional,
            # and resolving it before the parse keeps the two concerns apart.
            next_url = r.links.get("next", {}).get("url", "")
            try:
                # Load the response json as a Python dictionary
                r_text = json.loads(r.text)
                # Loop through each query result
                print("Result Items: {}".format(r_text["items"]))
                for result in r_text["items"]:
                    try:
                        # We don't need all the data from the results. Save the few pieces of info that
                        # we're interested in.
                        results.append({"package": pkg, "crossover_file": result["html_url"],
                                        "crossover_package": result["repository"]["name"]})
                    except KeyError:
                        # This result was missing a piece of data that we need.
                        print("Error parsing a result for: {}".format(pkg))
            except Exception:
                # There was a problem trying to parse the json response, or 'items' key is not in the response
                # results
                print("Missing data from Github response object for package: {}".format(pkg))
        else:
            # There was a bad HTTP response from Github. If the error is 403,
            # then we likely hit the rate limiter and need to increase the sleep time between requests.
            print("Received error from Github API: Status Code: {}".format(r.status_code))
        # Don't query github too fast or you'll hit the limiter and get a bad response. Give it some time.
        time.sleep(10)
    except Exception:
        # Did your internet connection go out? Something is wrong with sending out the request
        print("Unable to make Github request. Connection issues.")
    return results, next_url
# Only run the scraper when executed as a script — not on import, which would
# otherwise kick off network requests and file writes as a side effect.
if __name__ == "__main__":
    main("python", "packagesToScrape.csv", ghtoken="")