diff --git a/task-01/completed.md b/task-01/completed.md index bca9187..baec4d1 100644 --- a/task-01/completed.md +++ b/task-01/completed.md @@ -1,2 +1,2 @@ ## Those who have completed this task: - +hsnee diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py index 4535204..f7c4c2c 100644 --- a/task-03/get_top_names.py +++ b/task-03/get_top_names.py @@ -9,12 +9,24 @@ def extract_data_lines(filename, start_text, end_text): open `filename`, and yield the lines between the line that contains `start_text` and the line that contains `end_text` """ - # fill in code as needed + turn_on = False with open(filename) as fh: - for line in fh: - # fill in code as needed - # use `yield line` to return desired lines but keep the function going + for i,line in enumerate(fh): + if turn_on=='done': break + if end_text in line: + if include_end: + turn_on = 'done' + yield line + break + + if turn_on: yield line + + if start_text in line: + if include_start: + turn_on = True + yield line + turn_on = True if __name__ == '__main__': filename = 'top5names.html' diff --git a/task-04/README.md b/task-04/README.md index 1336400..3521252 100644 --- a/task-04/README.md +++ b/task-04/README.md @@ -23,7 +23,7 @@ year, ranking, and gender altogether. A possible way to encode the information i ```python year = 2017 - index // 10 rank = index % 5 + 1 -gender = 'female' if index % 10 < 5 else 'male` +gender = 'female' if index % 10 < 5 else 'male' ``` While this data model preserves all the information, it is unlikely that this model will be very convenient when it comes to data exploration. diff --git a/task-07/README.md b/task-07/README.md new file mode 100644 index 0000000..4173f3d --- /dev/null +++ b/task-07/README.md @@ -0,0 +1,33 @@ +# Task 7: Data scraping + +*prerequisites*: [Task 3](../task-03), [Task 4](../task-04) + +Finally, we will now actually do the data scraping! + +The data model we will use for this task would be +a table with 4 columns: year, gender, rank, name. 
+Each row in this table corresponds to one cell that contains one name +in the original table on the website. + +Each year will result in 10 rows. The first two rows of this table would look like: + +| year | gender | rank | name | +| ---- |--------| -----| ---- | +| 2017 | female | 1 | Emma | +| 2017 | female | 2 | Olivia | + + +## Steps + +*(Do Steps 1-3 in `task-07/get_top_names.py`)* + +1. Copy the answer from Task 3 to complete the function `extract_data_lines` +2. Complete the data scraping: add a for loop that iterates over `data_lines` and + append a tuple to `records` for each name (each tuple corresponds to one row in the new table). +3. Use the table we constructed (stored as a pandas data frame) to answer the questions in Task 4. +4. Submit a pull request for your solution. + +## Food for thought: +- Do you think this is a good data model? Why or why not? +- What assumptions did you make when you implemented Step 2? + How likely will your scraping method fail if the underlying webpage source changes? 
diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py new file mode 100644 index 0000000..71217e4 --- /dev/null +++ b/task-07/get_top_names.py @@ -0,0 +1,76 @@ +""" +get_top_names.py +For astrophg/learning-by-doing: Task 7 +https://github.com/astropgh/learning-by-doing/tree/master/task-07 +""" + +import pandas as pd +import re +from itertools import groupby + +def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False): + """ + open `filename`, and yield the lines between + the line that contains `start_text` and the line that contains `end_text` + """ + turn_on = False + with open(filename) as fh: + for i, line in enumerate(fh): + if turn_on == 'done': break + + if end_text in line: + if include_end: + turn_on = 'done' + yield line + break + + if turn_on: yield line + + if start_text in line: + if include_start: + turn_on = True + yield line + turn_on = True + + +if __name__ == '__main__': + filename = '../task-03/top5names.html' + start_text = '