From 8e89677822879ae7c7b16dbf148afa17ca086db6 Mon Sep 17 00:00:00 2001 From: Husni Almoubayyed Date: Tue, 2 Oct 2018 16:06:29 -0400 Subject: [PATCH 01/11] task-01 completed --- task-01/completed.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task-01/completed.md b/task-01/completed.md index bca9187..baec4d1 100644 --- a/task-01/completed.md +++ b/task-01/completed.md @@ -1,2 +1,2 @@ ## Those who have completed this task: - +hsnee From e0d2f4caf2cf2fe2647470d669a325546de10020 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Tue, 2 Oct 2018 14:10:25 -0400 Subject: [PATCH 02/11] bug fix --- task-03/get_top_names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py index b77911c..4535204 100644 --- a/task-03/get_top_names.py +++ b/task-03/get_top_names.py @@ -16,7 +16,7 @@ def extract_data_lines(filename, start_text, end_text): # use `yield line` to return desired lines but keep the function going -if name == '__main__': +if __name__ == '__main__': filename = 'top5names.html' start_text = '2017' end_text = '' From 8a160151249a8cccea5f1d6ae8fcd7648ed7f822 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Tue, 2 Oct 2018 19:35:40 -0400 Subject: [PATCH 03/11] add task 4 --- task-04/README.md | 68 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 task-04/README.md diff --git a/task-04/README.md b/task-04/README.md new file mode 100644 index 0000000..1336400 --- /dev/null +++ b/task-04/README.md @@ -0,0 +1,68 @@ +# Task 4: [database] Preparing for data scraping: design a data model for top baby names + +## Background + +Before we start to scrape the top baby names from the webpage, we need to design +a data model that we will use to store the data. + +The term "data model" has different meanings in different contexts. +We can ask what kind of object the data will be stored in. +A python list? A python dictionary? A pandas data frame? 
+For a given type, we can further ask how the data is stored. +For example, if we store the data in a pandas data frame, we can ask what +are the columns and rows. + +Let's look at some examples. +The original webpage stores the names as a table, with columns being +`year`, `female_rank1`, `female_rank2`, `male_rank1`, `male_rank2`..., and +each row corresponds to one single year. + +A more extreme example would be storing the names as a sequence (say a python list), +the content of the sequence will be the names, while the indices of the sequence encode +year, ranking, and gender altogether. A possible way to encode the information is +```python +year = 2017 - index // 10 +rank = index % 5 + 1 +gender = 'female' if index % 10 < 5 else 'male` +``` +While this data model preserves all the information, it is unlikely that this +model will be very convenient when it comes to data exploration. + +Yet another totally different data model is to group the data by names. +Let's say we'll store the data in a python dictionary. A possible way is: +```python +{ + 'Emma':{ + 'gender': 'female', + 'years_ranked_1': [2017, 2016, 2015, 2014, ...], + 'years_ranked_2': [2013, 2012, 2009, ...], + 'years_ranked_3': [...], + }, + 'Noah':{ + ..., + }, + ..., +} +``` + +Note that the form (object) in which the data is stored and how the data is structured +are two different things. (*Food for thought: why? can you give an example?*) + +Clearly, the choice of data model heavily depends on the questions that we would +like to answer with the data. +If the amount of data is very large, we will also need to consider the available +computing resources like memory usage and I/O speed when designing the data model. +For now, we don't yet need to worry about the limitation due to computing resources. + + +## Task + +Try to come up with a data model that is good for answering each of the following questions. 
+Think about the code you'll need to write to interact with the data model to answer +these questions. + +1. In which years is Emma the most chosen name? +2. Which name had been the most chosen name for the longest consecutive years? +3. How many unique male names have been in the top 5 between years 1980 and 2000? +4. Are there more unique male names or more unique female names that are in the top 5? +5. What is the distribution of the numbers of consecutive years that a male name remains the most chosen name? From 458086869b9d18d5cd9df2d8a164c58bcd0a1e65 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Wed, 3 Oct 2018 14:07:26 -0400 Subject: [PATCH 04/11] add task-01 solution --- task-01/solution.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 task-01/solution.md diff --git a/task-01/solution.md b/task-01/solution.md new file mode 100644 index 0000000..a194ab8 --- /dev/null +++ b/task-01/solution.md @@ -0,0 +1,43 @@ +# Solution to Task 1: [git] Fork a repo and submit a pull request + +## Steps + +1. Fork `astropgh/learning-by-doing` repository + +> Click the "fork" button on the upper right corner on GitHub. + +2. Clone your fork + +```bash +git clone git@github.com:yourusername/learning-by-doing.git +``` + +3. Checkout a new branch called `task/01` + +```bash +cd learning-by-doing +git checkout -b task/01 +``` + +4. Add your GitHub username to `task-01/completed.md` + +```bash +echo "yourusername" >> task-01/completed.md +``` + +5. Commit your change to `task/01` + +```bash +git add task-01/completed.md +git commit -m "add my username to complete task 01" +``` + +6. Push `task/01` to your fork + +```bash +git push origin task/01 +``` + +7. 
Submit a pull request + +> Click "Create pull request" button on GitHub From b033c2b5288f1c32fb569ee813d3bbb7e2879e79 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Wed, 3 Oct 2018 22:39:39 -0400 Subject: [PATCH 05/11] fix typo --- task-02/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/task-02/README.md b/task-02/README.md index 4b418ae..ab38dd1 100644 --- a/task-02/README.md +++ b/task-02/README.md @@ -16,8 +16,7 @@ - https://help.github.com/articles/configuring-a-remote-for-a-fork/ - https://help.github.com/articles/syncing-a-fork/ -## Food for thoughts +## Food for thought - What's the difference between a fork and a clone? - What's the difference between `origin` and `upstream` in this case? - What's the benefit to work on new branches like `task/01` and `task/02`, rather than on `master` directly? - From 633554a795eb51162cf65cd1172c45e0289be15a Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Wed, 3 Oct 2018 22:42:52 -0400 Subject: [PATCH 06/11] add task 5 --- task-05/README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 task-05/README.md diff --git a/task-05/README.md b/task-05/README.md new file mode 100644 index 0000000..8c91522 --- /dev/null +++ b/task-05/README.md @@ -0,0 +1,27 @@ +# Task 5: [git] Merge and rebase + +*prerequisites*: [Task 1](../task-01), [Task 2](../task-02) + +We will now learn two basic operations of git branches: merge and rebase. +As always, you can find lots of information about this on the Internet, +and here we will go ahead to learn by trying them out. + +## Part 1 +Complete Level 1 through 4 on https://learngitbranching.js.org/ + +## Part 2 +1. Go back to your clone of `learning-by-doing`. Make sure you've completed Tasks [1](../task-01) and [2](../task-02). +2. Do **only** Step 2 of [Task 2](../task-02) again. +3. Now the `master` branch and your `task/01` branch have diverged, and you will rebase `task/01` onto `master`. +4. 
Go to see your PR at https://github.com/astropgh/learning-by-doing/pulls, does it somehow change? Why? + +## Part 3 +*Note: Do Part 2 first!* + +1. Checkout a new branch called `task/05` from `master` (*What does this mean?*) +2. Add a new file `task-05/test` and commit it to `task/05` +3. Merge `task/05` into `task/01` +4. Go to see your PR at https://github.com/astropgh/learning-by-doing/pulls, does it somehow change? Why? + +## Food for thought +- What's the difference between "rebase" and "merge"? From fc6cc2ad5db2982c198cb6f3286d7ef315c8f2a1 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Wed, 3 Oct 2018 22:43:00 -0400 Subject: [PATCH 07/11] add task 6 --- task-06/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 task-06/README.md diff --git a/task-06/README.md b/task-06/README.md new file mode 100644 index 0000000..d86075f --- /dev/null +++ b/task-06/README.md @@ -0,0 +1,9 @@ +# Task 6: [database] Basic SQL + +Complete "Basic SQL" Lessons 1 through 6 on https://community.modeanalytics.com/sql/ + +## Extension +Complete "Basic SQL" Lessons 7 through 15 on https://community.modeanalytics.com/sql/ + +## Food for thought +- After learning the basic SQL operation, would you change your answers to [Task 4](../task-04)? 
From 36658e5d0eafbd28d2d72d37739e89fcfcf294a8 Mon Sep 17 00:00:00 2001 From: Husni Almoubayyed Date: Sun, 7 Oct 2018 15:16:58 -0400 Subject: [PATCH 08/11] completing get_top_names generator --- task-03/get_top_names.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py index 4535204..f7c4c2c 100644 --- a/task-03/get_top_names.py +++ b/task-03/get_top_names.py @@ -9,12 +9,24 @@ def extract_data_lines(filename, start_text, end_text): open `filename`, and yield the lines between the line that contains `start_text` and the line that contains `end_text` """ - # fill in code as needed + turn_on = False with open(filename) as fh: - for line in fh: - # fill in code as needed - # use `yield line` to return desired lines but keep the function going + for i,line in enumerate(fh): + if turn_on=='done': break + if end_text in line: + if include_end: + turn_on = 'done' + yield line + break + + if turn_on: yield line + + if start_text in line: + if include_start: + turn_on = True + yield line + turn_on = True if __name__ == '__main__': filename = 'top5names.html' From e11afd74a2754de2fc2d6f0f6ef7d350cd4417de Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Tue, 9 Oct 2018 18:07:10 -0400 Subject: [PATCH 09/11] fix typo --- task-04/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task-04/README.md b/task-04/README.md index 1336400..3521252 100644 --- a/task-04/README.md +++ b/task-04/README.md @@ -23,7 +23,7 @@ year, ranking, and gender altogether. A possible way to encode the information i ```python year = 2017 - index // 10 rank = index % 5 + 1 -gender = 'female' if index % 10 < 5 else 'male` +gender = 'female' if index % 10 < 5 else 'male' ``` While this data model preserves all the information, it is unlikely that this model will be very convenient when it comes to data exploration. 
From a6eb2553333fcaa59e2098b4f70e88fe42488bff Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Tue, 9 Oct 2018 18:07:22 -0400 Subject: [PATCH 10/11] add task 7 --- task-07/README.md | 33 +++++++++++++++++++++++++++++++++ task-07/get_top_names.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 task-07/README.md create mode 100644 task-07/get_top_names.py diff --git a/task-07/README.md b/task-07/README.md new file mode 100644 index 0000000..4173f3d --- /dev/null +++ b/task-07/README.md @@ -0,0 +1,33 @@ +# Task 7: Data scraping + +*prerequisites*: [Task 3](../task-03), [Task 4](../task-04) + +Finally, we will now actually do the data scraping! + +The data model we will use for this task would be +a table with 4 columns: year, gender, rank, name. +Each row in this table corresponds to one cell that contains one name +in the original table on the website. + +Each year will result in 10 rows. The first two rows of this table would look like: + +| year | gender | rank | name | +| ---- |--------| -----| ---- | +| 2017 | female | 1 | Emma | +| 2017 | female | 2 | Olivia | + + +## Steps + +*(Do Steps 1-3 in `task-07/get_top_names.py`)* + +1. Copy the answer from Task 3 to complete the function `extract_data_lines` +2. Complete the data scraping: add a for loop that iterates over `data_lines` and + appends a tuple to `records` for each name (corresponding to each row in the new table). +3. Use the table we constructed (stored as a pandas data frame) to answer the questions in Task 4. +4. Submit a pull request for your solution. + +## Food for thought: +- Do you think this is a good data model? Why or why not? +- What assumptions did you make when you implemented Step 2? + How likely will your scraping method fail if the underlying webpage source changes? 
diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py new file mode 100644 index 0000000..b506456 --- /dev/null +++ b/task-07/get_top_names.py @@ -0,0 +1,34 @@ +""" +get_top_names.py +For astrophg/learning-by-doing: Task 7 +https://github.com/astropgh/learning-by-doing/tree/master/task-07 +""" + +import pandas as pd + +def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False): + # Add your code for Step 1 here + # Just copy your answer from task-03 + # Remove the raise statement when you are done + raise NotImplementedError + + +if __name__ == '__main__': + filename = '../task-03/top5names.html' + start_text = '2017' + end_text = '' + + records = [] + data_lines = extract_data_lines(filename, start_text, end_text, include_start=True) + + # Add your code for Step 2 here + # This will involve a for loop that iterates over `data_lines` + # For each row, you will append a tuple to `records` + + data = pd.DataFrame.from_records(records, columns=['year', 'gender', 'rank', 'name']) + + # Add your code for Step 3 here + # You will use `data` to find and print out the answers for each questions listed in Task 4 + + # For example, to answer question 1: + print(data.query('name == "Emma"').query('rank == 1')['year'].tolist()) From 8d26ad10fcfeaa847314242573fac7a5690778e0 Mon Sep 17 00:00:00 2001 From: Husni Almoubayyed Date: Sat, 13 Oct 2018 13:44:30 -0400 Subject: [PATCH 11/11] finishing task 7 --- task-07/get_top_names.py | 58 ++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py index b506456..71217e4 100644 --- a/task-07/get_top_names.py +++ b/task-07/get_top_names.py @@ -5,12 +5,32 @@ """ import pandas as pd +import re +from itertools import groupby def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False): - # Add your code for Step 1 here - # Just copy your answer from task-03 - # 
Remove the raise statement when you are done - raise NotImplementedError + """ + open `filename`, and yield the lines between + the line that contains `start_text` and the line that contains `end_text` + """ + turn_on = False + with open(filename) as fh: + for i, line in enumerate(fh): + if turn_on == 'done': break + + if end_text in line: + if include_end: + turn_on = 'done' + yield line + break + + if turn_on: yield line + + if start_text in line: + if include_start: + turn_on = True + yield line + turn_on = True if __name__ == '__main__': @@ -20,15 +40,37 @@ def extract_data_lines(filename, start_text, end_text, include_start=False, incl records = [] data_lines = extract_data_lines(filename, start_text, end_text, include_start=True) + for data_line in data_lines: - # Add your code for Step 2 here - # This will involve a for loop that iterates over `data_lines` - # For each row, you will append a tuple to `records` + genders = {i:'Female' if i<5 else 'Male' for i in range(10)} + for line in extract_data_lines(filename, start_text, end_text, include_start=True, include_end=False): + line = re.split(' |align="center"||||||\n', line) + line = list(filter(None, line)) + if len(line)==1: + year = int(line[0]) + else: + for i, name in enumerate(line): + records.append((year, genders[i], i%5+1, name)) data = pd.DataFrame.from_records(records, columns=['year', 'gender', 'rank', 'name']) # Add your code for Step 3 here # You will use `data` to find and print out the answers for each questions listed in Task 4 - # For example, to answer question 1: + # For example, to answer question 1 Which years Emma is the most chosen names?: print(data.query('name == "Emma"').query('rank == 1')['year'].tolist()) + + # question 2 Which name had been the most chosen name for the longest consecutive years? 
+    for gender in ["Male", "Female"]:  # `data` is in year order, so groupby() yields consecutive runs
+        top_names = data[(data['gender'] == gender) & (data['rank'] == 1)]['name'].tolist()
+        print(gender+' name with most occurences is ',  # max() picks the longest run (the old slice returned the shortest)
+                max(((name, sum(1 for _ in occurence)) for name, occurence in groupby(top_names)), key=lambda x: x[1])[0])
+    # question 3 How many unique male names have be on top 5 between years 1980 and 2000?
+    print(data[(data['gender'] == 'Male') & (data['rank'] <= 5) & data['year'].between(1980, 2000)]['name'].nunique())
+
+    # question 4 Are there more unique male names or more unique female names that are on top 5? prints True if more unique male names
+    print(data[(data['gender'] == 'Male') & (data['rank'] <= 5)]['name'].nunique() > data[(data['gender'] == 'Female') & (data['rank'] <= 5)]['name'].nunique())
+
+    # question 5 What is the distribution of the numbers of consecutive years that a male name remains the most chosen name?
+    male_top_names = data[(data['gender'] == 'Male') & (data['rank'] == 1)]['name'].tolist()
+    print(pd.Series([sum(1 for _ in occurence) for _, occurence in groupby(male_top_names)]).value_counts().sort_index())  # run length -> number of runs