From 31ddbeeeacdfacdce564f6814df28f03f3192d57 Mon Sep 17 00:00:00 2001 From: ongzexuan Date: Wed, 10 Jun 2020 13:03:30 -0400 Subject: [PATCH 1/3] Fixed _extract_post_id returning null values caused by multiple items in page with class _5pcq --- scraper.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scraper.py b/scraper.py index 7d66606..8394874 100644 --- a/scraper.py +++ b/scraper.py @@ -36,9 +36,11 @@ def _extract_link(item): def _extract_post_id(item): postIds = item.find_all(class_="_5pcq") post_id = "" - for postId in postIds: - post_id = f"https://www.facebook.com{postId.get('href')}" - return post_id + for postLink in postLinks: + currlink = postLink.find('a').get('href') + if currlink.startswith('/'): + link = currlink.split('?')[0] + return link def _extract_image(item): From e223f9906b3efd71443b8a60c1d1b64af3a511d1 Mon Sep 17 00:00:00 2001 From: ongzexuan Date: Wed, 10 Jun 2020 13:17:24 -0400 Subject: [PATCH 2/3] Fixed variable naming issue in _extract_post_id --- scraper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scraper.py b/scraper.py index 8394874..7b504e1 100644 --- a/scraper.py +++ b/scraper.py @@ -39,8 +39,8 @@ def _extract_post_id(item): for postLink in postLinks: currlink = postLink.find('a').get('href') if currlink.startswith('/'): - link = currlink.split('?')[0] - return link + post_id = currlink.split('?')[0] + return post_id def _extract_image(item): From 5dd39c58feaff22a4e252853ec4ed1e2602baa74 Mon Sep 17 00:00:00 2001 From: ongzexuan Date: Wed, 10 Jun 2020 13:23:26 -0400 Subject: [PATCH 3/3] Fixed variable naming confusion between _extract_link and _extract_post_id --- scraper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scraper.py b/scraper.py index 7b504e1..056b8af 100644 --- a/scraper.py +++ b/scraper.py @@ -36,10 +36,10 @@ def _extract_link(item): def _extract_post_id(item): postIds = item.find_all(class_="_5pcq") post_id = "" - for postLink in postLinks: - currlink = postLink.find('a').get('href') + for postId in postIds: + currlink = postId.get('href') if currlink.startswith('/'): - post_id = currlink.split('?')[0] + post_id = f"https://www.facebook.com{currlink.split('?')[0]}" return post_id