From 8e89677822879ae7c7b16dbf148afa17ca086db6 Mon Sep 17 00:00:00 2001
From: Husni Almoubayyed <husni@physics.org>
Date: Tue, 2 Oct 2018 16:06:29 -0400
Subject: [PATCH 01/11] task-01 completed

---
 task-01/completed.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/task-01/completed.md b/task-01/completed.md
index bca9187..baec4d1 100644
--- a/task-01/completed.md
+++ b/task-01/completed.md
@@ -1,2 +1,2 @@
 ## Those who have completed this task:
-
+hsnee

From e0d2f4caf2cf2fe2647470d669a325546de10020 Mon Sep 17 00:00:00 2001
From: Yao-Yuan Mao <yymao.astro@gmail.com>
Date: Tue, 2 Oct 2018 14:10:25 -0400
Subject: [PATCH 02/11] bug fix

---
 task-03/get_top_names.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py
index b77911c..4535204 100644
--- a/task-03/get_top_names.py
+++ b/task-03/get_top_names.py
@@ -16,7 +16,7 @@ def extract_data_lines(filename, start_text, end_text):
             # use `yield line` to return desired lines but keep the function going
 
 
-if name == '__main__':
+if __name__ == '__main__':
     filename = 'top5names.html'
     start_text = '<tr><td align="center">2017</td>'
     end_text = '</table></center></div><!-- end #content -->'

From 8a160151249a8cccea5f1d6ae8fcd7648ed7f822 Mon Sep 17 00:00:00 2001
From: Yao-Yuan Mao <yymao.astro@gmail.com>
Date: Tue, 2 Oct 2018 19:35:40 -0400
Subject: [PATCH 03/11] add task 4

---
 task-04/README.md | 68 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 task-04/README.md

diff --git a/task-04/README.md b/task-04/README.md
new file mode 100644
index 0000000..1336400
--- /dev/null
+++ b/task-04/README.md
@@ -0,0 +1,68 @@
+# Task 4: [database]  Preparing for data scraping: design a data model for top baby names
+
+## Background
+
+Before we start to scrape the top baby names from the webpage, we need to design
+a data model that we will use to store the data.
+
+The term "data model" has different meanings in different contexts.
+We can ask what kind of object the data will be stored in.
+A python list? A python dictionary? A pandas data frame?
+For a given type, we can further ask how the data is stored.
+For example, if we store the data in a pandas data frame, we can ask what
+are the columns and rows.
+
+Let's look at some examples.
+The original webpage stores the names as a table, with columns being
+`year`, `female_rank1`, `female_rank2`, `male_rank1`, `male_rank2`..., and
+each row corresponds to one single year.
+
+A more extreme example would be storing the names as a sequence (say a python list),
+the content of the sequence will be the names, while the indices of the sequence encode
+year, ranking, and gender altogether. A possible way to encode the information is
+```python
+year = 2017 - index // 10
+rank = index % 5 + 1
+gender = 'female' if index % 10 < 5 else 'male`
+```
+While this data model preserves all the information, it is unlikely that this
+model will be very convenient when it comes to data exploration.
+
+Yet another totally different data model is to group the data by names.
+Let's say we'll store the data in a python dictionary. A possible way is:
+```python
+{
+    'Emma':{
+        'gender': 'female',
+        'years_ranked_1': [2017, 2016, 2015, 2014, ...],
+        'years_ranked_2': [2013, 2012, 2009, ...],
+        'years_ranked_3': [...],
+    },
+    'Noah':{
+        ...,
+    },
+    ...,
+}
+```
+
+Note that the form (object) in which the data is stored and how the data is structured
+are two different things. (*Food for thought: why? Can you give an example?*)
+
+Clearly, the choice of data model heavily depends on the questions that we would
+like to answer with the data.
+If the amount of data is very large, we will also need to consider the available
+computing resources like memory usage and I/O speed when designing the data model.
+For now, we don't yet need to worry about the limitation due to computing resources.
+
+
+## Task
+
+Try to come up with a data model that is good for answering each of the following questions.
+Think about the code you'll need to write to interact with the data model to answer
+these questions.
+
+1. In which years is Emma the most chosen name?
+2. Which name had been the most chosen name for the longest consecutive years?
+3. How many unique male names have been in the top 5 between years 1980 and 2000?
+4. Are there more unique male names or more unique female names that are in the top 5?
+5. What is the distribution of the numbers of consecutive years that a male name remains the most chosen name?

From 458086869b9d18d5cd9df2d8a164c58bcd0a1e65 Mon Sep 17 00:00:00 2001
From: Yao-Yuan Mao <yymao.astro@gmail.com>
Date: Wed, 3 Oct 2018 14:07:26 -0400
Subject: [PATCH 04/11] add task-01 solution

---
 task-01/solution.md | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 task-01/solution.md

diff --git a/task-01/solution.md b/task-01/solution.md
new file mode 100644
index 0000000..a194ab8
--- /dev/null
+++ b/task-01/solution.md
@@ -0,0 +1,43 @@
+# Solution to Task 1: [git] Fork a repo and submit a pull request
+
+## Steps
+
+1. Fork `astropgh/learning-by-doing` repository
+
+> Click the "fork" button on the upper right corner on GitHub.
+
+2. Clone your fork
+
+```bash
+git clone git@github.com:yourusername/learning-by-doing.git
+```
+
+3. Checkout a new branch called `task/01`
+
+```bash
+cd learning-by-doing
+git checkout -b task/01
+```
+
+4. Add your GitHub username to `task-01/completed.md`
+
+```bash
+echo "yourusername" >> task-01/completed.md
+```
+
+5. Commit your change to `task/01`
+
+```bash
+git add task-01/completed.md
+git commit -m "add my username to complete task 01"
+```
+
+6. Push `task/01` to your fork
+
+```bash
+git push origin task/01
+```
+
+7. Submit a pull request
+
+> Click "Create pull request" button on GitHub

From b033c2b5288f1c32fb569ee813d3bbb7e2879e79 Mon Sep 17 00:00:00 2001
From: Yao-Yuan Mao <yymao.astro@gmail.com>
Date: Wed, 3 Oct 2018 22:39:39 -0400
Subject: [PATCH 05/11] fix typo

---
 task-02/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/task-02/README.md b/task-02/README.md
index 4b418ae..ab38dd1 100644
--- a/task-02/README.md
+++ b/task-02/README.md
@@ -16,8 +16,7 @@
 - https://help.github.com/articles/configuring-a-remote-for-a-fork/
 - https://help.github.com/articles/syncing-a-fork/
 
-## Food for thoughts
+## Food for thought
 - What's the difference between a fork and a clone?
 - What's the difference between `origin` and `upstream` in this case?
 - What's the benefit to work on new branches like `task/01` and `task/02`, rather than on `master` directly?
-

From 633554a795eb51162cf65cd1172c45e0289be15a Mon Sep 17 00:00:00 2001
From: Yao-Yuan Mao <yymao.astro@gmail.com>
Date: Wed, 3 Oct 2018 22:42:52 -0400
Subject: [PATCH 06/11] add task 5

---
 task-05/README.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 task-05/README.md

diff --git a/task-05/README.md b/task-05/README.md
new file mode 100644
index 0000000..8c91522
--- /dev/null
+++ b/task-05/README.md
@@ -0,0 +1,27 @@
+# Task 5: [git] Merge and rebase
+
+*prerequisites*: [Task 1](../task-01), [Task 2](../task-02)
+
+We will now learn two basic operations of git branches: merge and rebase.
+As always, you can find lots of information about this on the Internet,
+and here we will go ahead to learn by trying them out.
+
+## Part 1
+Complete Levels 1 through 4 on https://learngitbranching.js.org/
+
+## Part 2
+1. Go back to your clone of `learning-by-doing`. Make sure you've completed Tasks [1](../task-01) and [2](../task-02).
+2. Do **only** Step 2 of [Task 2](../task-02) again.
+3. Now the `master` branch and your `task/01` branch have diverged, and you will rebase `task/01` onto `master`.
+4. Go to see your PR at https://github.com/astropgh/learning-by-doing/pulls, does it somehow change? Why?
+
+## Part 3
+*Note: Do Part 2 first!*
+
+1. Checkout a new branch called `task/05` from `master` (*What does this mean?*)
+2. Add a new file `task-05/test` and commit it to `task/05`
+3. Merge `task/05` into `task/01`
+4. Go to see your PR at https://github.com/astropgh/learning-by-doing/pulls, does it somehow change? Why?
+
+## Food for thought
+- What's the difference between "rebase" and "merge"?

From fc6cc2ad5db2982c198cb6f3286d7ef315c8f2a1 Mon Sep 17 00:00:00 2001
From: Yao-Yuan Mao <yymao.astro@gmail.com>
Date: Wed, 3 Oct 2018 22:43:00 -0400
Subject: [PATCH 07/11] add task 6

---
 task-06/README.md | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 task-06/README.md

diff --git a/task-06/README.md b/task-06/README.md
new file mode 100644
index 0000000..d86075f
--- /dev/null
+++ b/task-06/README.md
@@ -0,0 +1,9 @@
+# Task 6: [database] Basic SQL
+
+Complete "Basic SQL" Lessons 1 through 6 on https://community.modeanalytics.com/sql/
+
+## Extension
+Complete "Basic SQL" Lessons 7 through 15 on https://community.modeanalytics.com/sql/
+
+## Food for thought
+- After learning the basic SQL operation, would you change your answers to [Task 4](../task-04)?

From 36658e5d0eafbd28d2d72d37739e89fcfcf294a8 Mon Sep 17 00:00:00 2001
From: Husni Almoubayyed <husni@physics.org>
Date: Sun, 7 Oct 2018 15:16:58 -0400
Subject: [PATCH 08/11] completing get_top_names generator

---
 task-03/get_top_names.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py
index 4535204..f7c4c2c 100644
--- a/task-03/get_top_names.py
+++ b/task-03/get_top_names.py
@@ -9,12 +9,24 @@ def extract_data_lines(filename, start_text, end_text):
     open `filename`, and yield the lines between
     the line that contains `start_text` and the line that contains `end_text`
     """
-    # fill in code as needed
+    turn_on = False
     with open(filename) as fh:
-        for line in fh:
-            # fill in code as needed
-            # use `yield line` to return desired lines but keep the function going
+        for line in fh:
+            if not turn_on:
+                # Skip everything up to and including the start line.
+                # The start line itself is not yielded (exclusive bound).
+                turn_on = start_text in line
+                continue
 
+            # Only look for `end_text` once the start line has been seen,
+            # so an early occurrence of `end_text` cannot end the scan.
+            if end_text in line:
+                # The end line itself is not yielded (exclusive bound).
+                break
+
+            # NOTE: this signature has no `include_start`/`include_end`
+            # parameters, so both bounds are always exclusive here.
+            yield line
 
 if __name__ == '__main__':
     filename = 'top5names.html'

From e11afd74a2754de2fc2d6f0f6ef7d350cd4417de Mon Sep 17 00:00:00 2001
From: Yao-Yuan Mao <yymao.astro@gmail.com>
Date: Tue, 9 Oct 2018 18:07:10 -0400
Subject: [PATCH 09/11] fix typo

---
 task-04/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task-04/README.md b/task-04/README.md
index 1336400..3521252 100644
--- a/task-04/README.md
+++ b/task-04/README.md
@@ -23,7 +23,7 @@ year, ranking, and gender altogether. A possible way to encode the information i
 ```python
 year = 2017 - index // 10
 rank = index % 5 + 1
-gender = 'female' if index % 10 < 5 else 'male`
+gender = 'female' if index % 10 < 5 else 'male'
 ```
 While this data model preserves all the information, it is unlikely that this
 model will be very convenient when it comes to data exploration.

From a6eb2553333fcaa59e2098b4f70e88fe42488bff Mon Sep 17 00:00:00 2001
From: Yao-Yuan Mao <yymao.astro@gmail.com>
Date: Tue, 9 Oct 2018 18:07:22 -0400
Subject: [PATCH 10/11] add task 7

---
 task-07/README.md        | 33 +++++++++++++++++++++++++++++++++
 task-07/get_top_names.py | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 task-07/README.md
 create mode 100644 task-07/get_top_names.py

diff --git a/task-07/README.md b/task-07/README.md
new file mode 100644
index 0000000..4173f3d
--- /dev/null
+++ b/task-07/README.md
@@ -0,0 +1,33 @@
+# Task 7: Data scraping
+
+*prerequisites*: [Task 3](../task-03), [Task 4](../task-04)
+
+Finally, we will now actually do the data scraping!
+
+The data model we will use for this task would be
+a table with 4 columns: year, gender, rank, name.
+Each row in this table corresponds to one cell that contains one name
+in the original table on the website.
+
+Each year will result in 10 rows. The first two rows of this table would look like:
+
+| year | gender | rank | name |
+| ---- |--------| -----| ---- |
+| 2017 | female | 1 | Emma |
+| 2017 | female | 2 | Olivia |
+
+
+## Steps
+
+*(Do Step 1-3 in `task-07/get_top_names.py`)*
+
+1. Copy the answer from Task 3 to complete the function `extract_data_lines`
+2. Complete the data scraping: add a for loop that iterates over `data_lines` and
+   append a tuple to `records` for each name (corresponds to each row in the new table).
+3. Use the table we constructed (stored as a pandas data frame) to answer the questions in Task 4.
+4. Submit a pull request for your solution.
+
+## Food for thought:
+- Do you think this is a good data model? Why or why not?
+- What assumptions did you make when you implemented Step 2?
+  How likely will your scraping method fail if the underlying webpage source changes?
diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py
new file mode 100644
index 0000000..b506456
--- /dev/null
+++ b/task-07/get_top_names.py
@@ -0,0 +1,34 @@
+"""
+get_top_names.py
+For astropgh/learning-by-doing: Task 7
+https://github.com/astropgh/learning-by-doing/tree/master/task-07
+"""
+
+import pandas as pd
+
+def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False):
+    # Add your code for Step 1 here
+    # Just copy your answer from task-03
+    # Remove the raise statement when you are done
+    raise NotImplementedError
+
+
+if __name__ == '__main__':
+    filename = '../task-03/top5names.html'
+    start_text = '<tr><td align="center">2017</td>'
+    end_text = '</table></center></div><!-- end #content -->'
+
+    records = []
+    data_lines = extract_data_lines(filename, start_text, end_text, include_start=True)
+
+    # Add your code for Step 2 here
+    # This will involve a for loop that iterates over `data_lines`
+    # For each row, you will append a tuple to `records`
+
+    data = pd.DataFrame.from_records(records, columns=['year', 'gender', 'rank', 'name'])
+
+    # Add your code for Step 3 here
+    # You will use `data` to find and print out the answers for each questions listed in Task 4
+
+    # For example, to answer question 1:
+    print(data.query('name == "Emma"').query('rank == 1')['year'].tolist())

From 8d26ad10fcfeaa847314242573fac7a5690778e0 Mon Sep 17 00:00:00 2001
From: Husni Almoubayyed <husni@physics.org>
Date: Sat, 13 Oct 2018 13:44:30 -0400
Subject: [PATCH 11/11] finishing task 7

---
 task-07/get_top_names.py | 58 ++++++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py
index b506456..71217e4 100644
--- a/task-07/get_top_names.py
+++ b/task-07/get_top_names.py
@@ -5,12 +5,32 @@
 """
 
 import pandas as pd
+import re
+from itertools import groupby
 
 def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False):
-    # Add your code for Step 1 here
-    # Just copy your answer from task-03
-    # Remove the raise statement when you are done
-    raise NotImplementedError
+    """
+    Yield the lines of `filename` that lie between the first line containing
+    `start_text` and the next line containing `end_text`.
+
+    The start/end lines themselves are yielded only when `include_start` /
+    `include_end` is true. `end_text` is ignored until the start line has
+    been seen, and nothing is yielded if `start_text` never occurs.
+    """
+    started = False
+    with open(filename) as fh:
+        for line in fh:
+            if not started:
+                if start_text in line:
+                    started = True
+                    if include_start:
+                        yield line
+                continue
+            if end_text in line:
+                if include_end:
+                    yield line
+                break
+            yield line
 
 
 if __name__ == '__main__':
@@ -20,15 +40,37 @@ def extract_data_lines(filename, start_text, end_text, include_start=False, incl
 
     records = []
     data_lines = extract_data_lines(filename, start_text, end_text, include_start=True)
+    genders = ['Female'] * 5 + ['Male'] * 5  # each row lists 5 female, then 5 male names
 
-    # Add your code for Step 2 here
-    # This will involve a for loop that iterates over `data_lines`
-    # For each row, you will append a tuple to `records`
+    markup = re.compile(r' |align="center"|<td|>|<td>|<tr>|</td>|</tr>|\n')
+    for data_line in data_lines:
+        # Strip the HTML markup; what remains is either a year or the names.
+        tokens = [tok for tok in markup.split(data_line) if tok]
+        if len(tokens) == 1:
+            year = int(tokens[0])  # a line that only holds the year
+        else:
+            for i, name in enumerate(tokens):
+                records.append((year, genders[i], i % 5 + 1, name))
 
     data = pd.DataFrame.from_records(records, columns=['year', 'gender', 'rank', 'name'])
 
     # Add your code for Step 3 here
     # You will use `data` to find and print out the answers for each questions listed in Task 4
 
-    # For example, to answer question 1:
+    # For example, to answer question 1 (In which years is Emma the most chosen name?):
     print(data.query('name == "Emma"').query('rank == 1')['year'].tolist())
+
+    # Rank-1 streaks per gender as (name, number of consecutive years). Assumes rows
+    # are in page order (one year after another), so equal neighbours are consecutive.
+    top = data[data['rank'] == 1]
+    runs = {g: [(n, len(list(grp))) for n, grp in groupby(names)]
+            for g, names in top.groupby('gender')['name']}
+    # question 2: Which name had been the most chosen name for the longest consecutive years?
+    for gender, streaks in runs.items():
+        print(gender, 'name with the longest streak:', max(streaks, key=lambda s: s[1])[0])
+    # question 3: How many unique male names have been in the top 5 between years 1980 and 2000?
+    print(data[(data['gender'] == 'Male') & (data['rank'] <= 5) & data['year'].between(1980, 2000)]['name'].nunique())
+    # question 4: Are there more unique male names than female names in the top 5? (prints True if so)
+    print(data.query('gender == "Male"')['name'].nunique() > data.query('gender == "Female"')['name'].nunique())
+    # question 5: distribution of the lengths of consecutive-year streaks of the top male name
+    print(pd.Series([length for _, length in runs.get('Male', [])]).value_counts().sort_index())