From c701a4372e5d2e555d2eda9a2a4da3a7a10c6f11 Mon Sep 17 00:00:00 2001 From: MrPrajwal12 Date: Tue, 2 Jul 2024 23:28:51 +0530 Subject: [PATCH 1/6] Code for scraping Python blog page --- SJEC_CS106_workshop1 | 1 + python_blog_scraper/Dockerfile | 12 +++ python_blog_scraper/docker-compose.yml | 9 ++ python_blog_scraper/python_web_scrape.py | 121 +++++++++++++++++++++++ python_blog_scraper/requirements.txt | 4 + 5 files changed, 147 insertions(+) create mode 160000 SJEC_CS106_workshop1 create mode 100644 python_blog_scraper/Dockerfile create mode 100644 python_blog_scraper/docker-compose.yml create mode 100644 python_blog_scraper/python_web_scrape.py create mode 100644 python_blog_scraper/requirements.txt diff --git a/SJEC_CS106_workshop1 b/SJEC_CS106_workshop1 new file mode 160000 index 0000000..7b33172 --- /dev/null +++ b/SJEC_CS106_workshop1 @@ -0,0 +1 @@ +Subproject commit 7b33172c6d035f3570848ae829987bd4b8fecda9 diff --git a/python_blog_scraper/Dockerfile b/python_blog_scraper/Dockerfile new file mode 100644 index 0000000..7b1479e --- /dev/null +++ b/python_blog_scraper/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.10.2-alpine3.15 +# Create directories +RUN mkdir -p /root/workspace/src +COPY ./python_web_scrape.py /root/workspace/src +# Switch to project directory +WORKDIR /root/workspace/src +# Install required packages +RUN pip install --upgrade pip +RUN pip install requests bs4 html5lib +RUN pip install psycopg2-binary +CMD ["python_web_scrape.py"] +ENTRYPOINT ["python"] \ No newline at end of file diff --git a/python_blog_scraper/docker-compose.yml b/python_blog_scraper/docker-compose.yml new file mode 100644 index 0000000..0876790 --- /dev/null +++ b/python_blog_scraper/docker-compose.yml @@ -0,0 +1,9 @@ +psql-db: + image: 'postgres:14' + container_name: psql-db + environment: + - PGPASSWORD=123456 + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=123456 + ports: + - '5434:5432' \ No newline at end of file diff --git a/python_blog_scraper/python_web_scrape.py b/python_blog_scraper/python_web_scrape.py new file mode 100644 index 0000000..5316336 --- /dev/null +++ b/python_blog_scraper/python_web_scrape.py @@ -0,0 +1,121 @@ +import requests +from bs4 import BeautifulSoup +import re +import psycopg2 +from psycopg2 import Error + +url = 'https://blog.python.org/' + +def create_connection(db_name, db_user, db_password, db_host, db_port): + try: + connection = psycopg2.connect( + database=db_name, + user=db_user, + password=db_password, + host=db_host, + port=db_port + ) + print("Connection to PostgreSQL DB successful") + return connection + except Error as e: + print(f"The error '{e}' occurred") + return None + +def execute_query(connection, data): + cursor = connection.cursor() + try: + query = """ + INSERT INTO python_blog_articles (date, title, body, author) + VALUES (%s, %s, %s, %s) + """ + cursor.execute(query, data) + connection.commit() + print("Query executed successfully") + except Error as e: + print(f"The error '{e}' occurred") + +def create_table(connection): + cursor = connection.cursor() + try: + create_table_query = """ + CREATE TABLE IF NOT EXISTS python_blog_articles ( + id SERIAL PRIMARY KEY, + date VARCHAR(100), + title TEXT, + body TEXT, + author VARCHAR(100) + ); + """ + cursor.execute(create_table_query) + connection.commit() + print("Table created successfully or already exists") + except Error as e: + print(f"The error '{e}' occurred") + +def process_page(soup, date, titletext, bodytext, author): + for div in soup.find_all('div', class_='date-outer'): + date_header = div.find('h2', class_='date-header') + if date_header: + date_text = date_header.find('span').get_text(strip=True) + date.append(date_text) + + for post in div.find_all('div', class_='post-outer'): + title_head = post.find('h3', class_='post-title entry-title') + if title_head: + titletext.append(title_head.text.strip()) + + content_div = post.find('div', class_='post-body entry-content') + if content_div: + paragraph_text = ' '.join([p.text.strip() for p in content_div.find_all('p')]) + bodytext.append(paragraph_text) + + footer_head = post.find('div', class_='post-footer') + if footer_head: + footer_text = footer_head.find('span', class_='post-author vcard').text.strip() + author.append(footer_text) + +def main(): + db_name = 'webdemo' + db_user = 'postgres' + db_password = '123456' + db_host = 'localhost' + db_port = '5434' + + connection = create_connection(db_name, db_user, db_password, db_host, db_port) + + if connection: + try: + date = [] + titletext = [] + bodytext = [] + author = [] + + res = requests.get(url) + soup = BeautifulSoup(res.content, 'html5lib') + process_page(soup, date, titletext, bodytext, author) + + while len(titletext) < 50: + older_posts_link = soup.find('a', string=re.compile(r'Older Posts', re.IGNORECASE)) + if older_posts_link: + next_page_url = older_posts_link['href'] + res = requests.get(next_page_url) + soup = BeautifulSoup(res.content, 'html5lib') + process_page(soup, date, titletext, bodytext, author) + else: + break + + create_table(connection) + for i in range(len(titletext)): + data = (date[i], titletext[i], bodytext[i], author[i]) + execute_query(connection, data) + + except Error as e: + print(f"Error: {e}") + + finally: + if connection: + connection.close() + print("PostgreSQL connection is closed") + +if __name__ == "__main__": + main() diff --git a/python_blog_scraper/requirements.txt b/python_blog_scraper/requirements.txt new file mode 100644 index 0000000..02a89c4 --- /dev/null +++ b/python_blog_scraper/requirements.txt @@ -0,0 +1,4 @@ +psycopg2-binary==2.9.3 +beautifulsoup4==4.11.1 +requests==2.27.1 +html5lib==1.1 From b113b2573dfe76b7ac7ba2c8390e9da7e1fbdfa0 Mon Sep 17 00:00:00 2001 From: MrPrajwal12 Date: Wed, 3 Jul 2024 19:50:11 +0530 Subject: [PATCH 2/6] Code for scraping Python blog page --- Homework/DataEngineering-Workshop1 | 1 + {python_blog_scraper => Homework}/Dockerfile | 0 .../docker-compose.yml | 0 .../python_web_scrape.py | 0 .../requirements.txt | 0 SJEC_CS106_workshop1 | 1 - SJEC_CS106_workshop1.zip | Bin 0 -> 27541 bytes python_blog_scraper.zip | Bin 0 -> 2784 bytes 8 files changed, 1 insertion(+), 1 deletion(-) create mode 160000 Homework/DataEngineering-Workshop1 rename {python_blog_scraper => Homework}/Dockerfile (100%) rename {python_blog_scraper => Homework}/docker-compose.yml (100%) rename {python_blog_scraper => Homework}/python_web_scrape.py (100%) rename {python_blog_scraper => Homework}/requirements.txt (100%) delete mode 160000 SJEC_CS106_workshop1 create mode 100644 SJEC_CS106_workshop1.zip create mode 100644 python_blog_scraper.zip diff --git a/Homework/DataEngineering-Workshop1 b/Homework/DataEngineering-Workshop1 new file mode 160000 index 0000000..a90eaa9 --- /dev/null +++ b/Homework/DataEngineering-Workshop1 @@ -0,0 +1 @@ +Subproject commit a90eaa9d64f2a2a98070172efd16d8eef824400e diff --git a/python_blog_scraper/Dockerfile b/Homework/Dockerfile similarity index 100% rename from python_blog_scraper/Dockerfile rename to Homework/Dockerfile diff --git a/python_blog_scraper/docker-compose.yml b/Homework/docker-compose.yml similarity index 100% rename from python_blog_scraper/docker-compose.yml rename to Homework/docker-compose.yml diff --git a/python_blog_scraper/python_web_scrape.py b/Homework/python_web_scrape.py similarity index 100% rename from python_blog_scraper/python_web_scrape.py rename to Homework/python_web_scrape.py diff --git a/python_blog_scraper/requirements.txt b/Homework/requirements.txt similarity index 100% rename from python_blog_scraper/requirements.txt rename to Homework/requirements.txt diff --git a/SJEC_CS106_workshop1 b/SJEC_CS106_workshop1 deleted file mode 160000 index 7b33172..0000000 --- a/SJEC_CS106_workshop1 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7b33172c6d035f3570848ae829987bd4b8fecda9 diff --git a/SJEC_CS106_workshop1.zip b/SJEC_CS106_workshop1.zip new file mode 100644 index 0000000000000000000000000000000000000000..76e2e0040a5ac5435853cdaf4d3a0aedce41f411 GIT binary patch literal 27541 zcmdqJbyOBw(>F{?OG`<2cXu~Pceiwd2udR@CEXz1-HmjYbV#>^Ajr$*xw#L=Pd%LT ze9s^6yVrtC*Tveunc1^vX3xyt&m=*?P=EkKw|+(S`#=8k3l#_tNI^naP)AUKo{mMw z+1k$B-qhNLo>oZ_4hX`6*T*>h_6P9k=mG--47Ldh1oZvav+q+v0zm=|m;O0r_3gBz zznap}+Q8h%j@rQ5(#G1}h{n~@;yZ{f%^v_i0ih(L=^v36FI$m+_Mu-|O14t|8SMvJ z8gW_)>M8~%IS6Rdjg4p`I7lK;&o4hA9z8)77>dvl z=$Rx0r2VjzkH;-j0LK1g8LsS4mK}hBfN+1sve0*yjm<2KzO%aeYgQe@T4mbl5QGlC z1uDI80`b+Z$;K1T5jP5a=9|}RtNi8-7Hw5N*{d*^oOe~-NpaPjUkUP1hfz23C(8tYTh-P>%8ZAa{ z&WNN_Z;%sla~;cCBVQ~hyPUKN?%}4vK$O$f`30e0h^sx{Xqfa4{&YNn#1m;=Ou~7l zY1QGRh5BhdQQV0aAkTt&nVHpFSpuZ?r#$)I|0z!ocpxD1U&+(P)xp%-O2^qqU&r3S zPS3`O#>Vx#WIdn$Qi%#xG^}S?P(03y@nB&ifMmtT4B=-^mJAn+sU+1ku++gk@gyq- zl%6O>JlBW)dcfHrMnFtF9S;R5aCE5I946Ym_Dqfhp*_XCA3VhZgF6+DS5ye5$VMHe ztT2eVFdxcHhBZGBmr;FhutNLuBpEGwh6O{enkT{9o^Q^VPdz=^sr4lsf!x%By-uVH zh7qgOz34LFG7Rc5Y{)(_9!=uCC$CK)pD=ezQRhW{7VWh=EqkxIsbzzvDTBb}g?%c( zlf)d53PsCQ@8KrJ<4Q3q-D^8u z)w;ap@Nu8odMZc>_GX1MvnTRacI3!JlysszHK0Y11khiq(=KfNUe~+z)-Bz{E?zaM z65e3(2+r$}R%34;3PdIs>oc&248yQFvLP(Px40GC(JR}*6t9@(U2$U-E-!6*I(Bw|S+n-xm3J3qHKk=c786Qquy# zgK`1((q!=AySct0lcT4&=9B!Wmm>ICdCrn1phZ?d2dR+(O}Ku>Yb5a{f2NQ;{agft z>}dKS_w=bT8D;7K@mtb8cgZw?ewaF-PgZ6Ai6xm&P08FBXH!mdB z3{{7`D^|80v!xsIkxsL3fo_4qXe%^pqKSV@Q2Prhan>sxc3^V^B9Itvm^ZyJI9{F? zeq++in$9-tyX_hhz;%dGB1lsv{ypj}em$6JJ>wkDKUZyhU@D+h>$O{|(2DSMcdzQ1 zI5w_p!S}7hG>)$oQNBX7wtJbXT~7NRvsPzo#;e-PIQ4BCtBLgxZIijMUHscw@PSc* zCI<<-Gti#Si&=YFw7Ix;ZQd=jjCivy5f!%$^K5~L#Yo6`URc^3;B3<8Y)nz;7UNBX zuhUzt7)AO-*4n3cKu8kEpW2B$$3+orKZr|w3*RZs&%zz=uEgjJ`f?6=lL4IOTA7rO zH#KMs-^>g+g!!%Jk~n{_EQ0-4?88QsFBDs<`viWUM2kV%c|PxF#>C68CjyNeZ_L(D zth{K_1`}KXadKyE=UY<{NToOsx(VJipLcWft6pf&IfkSNZPe2&Zr^frIW1iBT0Kg) zSbF6CA}xiN@$uOUTtOixm6#ZWtiE8~O%<7@XQ-z$4W#hOG~0#ZqK%uuy6h3-eY|KK zqfK<6TSzC9w z4{uL->*Y&te*m99_4a^y*?*W9@3-@sT3egj|1mr1e`W`K{&aRgEOCE!C!|}n2h474 zZ)t61=3s3{?X2fuU}~vnMPsjLY4abK3xK=(`2n_2#nK*`72%u3P2i`)V9TyE@XIWN z$O`ODn54&sF62r0OK%ShHP)!Kh_r%9nVTEyDU)QIdfQa_7-{=*ko>l2hVf^ft4w|mq@c2JI@a~4n#M|7 ztys@K_Qj!Uw`nYwUkI3@9P~HR&eKW1Cx}E=MDLGEQHk_q*kTCG(o$U^Sb98Tn-#^b zNW3=M*H4I4k?{%1XHZYij#;UOSIO7Gd)W!eGWlV)zSt8mRU0Y2U+SCTt1?rjy_-=}&#VoR8M+ui7j@%FSPcjRcSd0>NBq zy2J)aLLtAI#Tu5`f^|eVD7pla@lD}WRUFKSdjm!Zvphk(S6;!~hUhfcvBvf4vuJQAaBgR2sy#g)N)8TAY9~Pvf*}W4ZBC}v=}rOB2pj)% zh355#WRUS6$lsZWjev1n+)V=G97P{P>qW$fX z^xY&@Qd}p9z3^7cu58{6m0@#-gNw`Q<<=0bj=(5~*T+pbmLZ3>$>e?`r$I!RO3w+i zT*#eX)4Z)GPqU+XSMz2*MEk6TI4P-NO7g>LTVflVSE|>u4C5@-(wVXV-4HkHl0OS2 z@xs&nWGJ8@3HJ!pui8r9R2{vJKFhwpmdh^=c9D#IFnZ5Hlz&jdTBYWO&f}GXNLb(J znH+LjVg8)LhMcJA!>{>hL;mf}+9B~FAt)3iryR-ie3a&?Kna2aV`B--va(RjI646m zCZBDS4E;9qpfoi|`+UYz-@LHwe@hhd$XC0z@PJ=-Jt=0(?g?hvUYBxR-eJugZ*4Dh zWKyx);VSAYyZ41thE~$R#3*j+h=T7s11UtMCzsQ)2=}|tB(1#J)rsW1Q6RDHo^u@V z+eIwl(tJ~|!Zn2Wq9c$~5*gh!`A*J2^k4Rp4s*kFACPl3c^KlRn8Pyu*beg)B`C??m*Ny!-}H7SormBTOyyxhKGJ7 zMG5~VPQw_iVB+2Awn?a>w`cHXIF!9IjKc&z`Rc5Dj2?pRo5xuL#9qk8b=dHM`4}yt zmriBo2#pLOiWKA|VZP?9rUXDFfKDS1(` zvm;E?gP1h2wZr8D!R%|ou}iz7Y@*qdRA2b=v@2BCMhV8h8iM)zekdXC6{uwo^VbG@ z#*V|06LSa!wXPzW$Qb~6ot^+<6#+-dm|Kp@U`4}QI#kxe{&>^1$YtBC|Ly8g#F8qN z-Metrv?mV0kRk%KBMf2hH_K_1K{3xS5yn}pOr_LMhQpK0jBIB|58Kjc6d@h)ARBmW z5O@8cBZa|Jh~8$EI6N!1M<2v_db*^Cwgyh5h9tJ*m#VNz%nD1+lWIFQf`)cb!XOar z&$?E6o{SOp)<1&XuRAQVYiKB?Y`|GB6q#FVX9PzR`)L4l2nUp)%e!Z(k>}jn3sB_N z7-1v}#B6b@6vsNp-?$};kKhSvq7^UBRkXEa=>D~2`U`kvVWEzWL*A2XFP+n~G;=AoQ#dzBus0m!!vtr| zETyiGoLBL(r=$^b5+g^ZWaTieaq^mQ;H#hPyO?IZt)H_x;T61G-le#PM|m|CqHhlc z$+t5z6!&gYA14fde>urZWwP{iwQ>h6G&V2&%b=`JgIK;xHA~b>1?Fc&K4?4$FZMxeYE-V+-vA#sKUi-9K zPYMxj1$A;N08_T;z^n^Lkzidrbe#Y*e=5^)P@Nz~`SR+GKDjatOV6PFuum?d(i5Fz z&+;U_YqH+@6DnzR*ce`DESWX?R^7=GyImuz*^TMxH;U^>Bb>-e z(j$xp>7l{V@+cQ0xUv?B<`qaEG=gpH=Of!TvBHr%*vAX)jaJ`40U^b~OD@+Z5I{fx z7yRcb?Y?u)0R{qMet>hfu`{C9v$3&owE_6%2Rde@a19wydRXC`@<2(BJic({D`f#M z?h?=b@i8GO61m+i(2j+p6X2Eek38LHD^~MewgatH{UFwao0|$4koX($HYx^)VqMA4 zcV*eYXaisu*S8X@eQ4gJ2%F}=J_FO~wV}|id)>4SRxI`I<*|3#G2eL$H9Wn^cVxc@xlF%zWw6ZXm zE|rHOQ&sxEkt&9aGH$J6zVnEG+Lpu(#+^U1@)UC7u694&H zEIe>6{#*xYOM8Q^f)G_<={ zt-G}63R!sydvbl%Tiignrd2__`dlH#RdmqV1P@9UfjND6x;HTdaPSGVhT9Blzc0JSWblWxJ%85EYPOU?to{QXt^TzIu@Unh6sBD{ z&)7lbfRC2~=Nli}C}KPbp;z{WLh?`xBIT`+Yj3(#e=8_gsknYy2` z40|gFcj!J&t%nh4w<8a0|JFS1&da;KFMX4Pb_xSn8DH5iP-wTZ2gv{DD`Wl$2#6l& z7a6wif%hK)GGLe7ro}AH93HqTmWq0}ISqtsi|fFAWFk0S4sl*%N1*n;5+Htv0^qYW z>PPBaYKg@1#N?bbf!Ak^8HUclm(nO<&IgTd&L-AHQj3sOu%7p;Ae?t>b#C467`F?4(QZh& z77mLE4yGy9l%a@=Kgif^BD;HniJ|~U;-xDB-2CxK*67pVSOrXh2*~I35>%*+7S4$+ zgt#b7;frg%3(9#NtOc$5CbP}?>rPA0vZC0{kSdyNUmuh?AsP~slTC}VIt;n_x1~zL z2AMXiRgw@Ao^e-~xHMy7TpQNlf`tZ!ew6ZN0bJ3|6cie-Hw=U!7ES1V;4W9r)uujm zPpldklP3}i%YYsw`6cgozJ%r;3c7-{cAzT64tlqsrOiLDVKy}r7X#d;R_!wx@bu?zf^2ZaEM2` z82jW-bv`N2u@7Nm#`z$lY)dT5LOd->dP(Dxn~@A9te-HUs=Yt)rJ?D7#C^Gn1aej0ehH(oa*II%X&K%_$4$)A7z{l3Gcc@$L8nME&*h_} z5C=DVjIFvb{IHkMDvW`X(KC^8vs2m-JRmKwq%|2`nQ1_yic5l%pPsdkSUvR{f*oG( z1Z#csD9BKdQfa$PuKV=kkiF?X`IAsC3I9ly45B^_o8hPQPO9O?CabowGPAyn z&;`^vIysNjt*7$IEYy--t4mYI}D> zY}4io_a01VbSuu)JbnSo@AM`)%=2(mC`gv|0ER6#vDqZs+d1idOM3`H?Kh)R4(g6g zI1CeCga*NP`Q9|vg%nz+*gIllFs1mfgH|8mhVF?8+<0y3U0-CrBhb90Hd*7#c=}<<>A*gIhuKmdPM)yh^WD&%`OdFlgljXKKdqqA8;f^RG&0Qbd=4xneLY0WD~j zZ;MA=Qtw<&ra)0HU}$SXQKf`Xp&d$fSrwVxMP2h%W@bJ(VjUYfkfc^3FowEoH1jPD~opZZus2!}S4NQ#;%&i?C>PkbDrR{HB zDYs}d$Xo8hdHPSbp)!%7t96l2FX;<8EoCtgGy`+QU$^=jR*u~)R09c(g&2=UQrn&D zEzVZMBsVk^RMwu%>)R4o1*eL#2Xns}{0OleJ!XK=8py`N5FRezn2D2tD~ z7V=m>xJiRr2FQNVECu*bC81E*E9d;F61#k6wPJ`occNBOkd~+j-t1e&*3WVQAdfPj z%;N3JMEN8`gVPybJ$|-!SU&OsyI@u)a~kXkhr|pJluTqw^ic70(va=LVpz@Oz{fdh zi@32+#$CFQkA!laC&FhI*QcE^_|?X?r5zYdO-rI>`Y_vo$B@na)?V&Htb+ECn18mS z#e8nG+wOaaNXMWs@kZ~2jITO(ILQ(IqeWS779N82CL$e6Hyfr!YTV=WNSszSp=x2; z5f0AG%{P=l>lV@2kFd2(E}mo64JFb(!cZJNhpKq3d05^U9twTo_eu7M{NwOrc}4?| zjBZkjTH5@Lb$%;2q7=Q|n8Y#ZeFHf}AZACqd`pMox7CDTpW3Jh%(wR*Uz)O z0VZ0?09X-L!uVitxzkio@X7i~VE1QG#Tc%12n+IH<_VoODl{B@B&u0cOi)`V^wO>f zh`?&JEAs#zyvFd4Lv`l!q}RMU)R@vMyP8*xF8q1rD|=|g9mgm6#2$9+DnYMm+HkQ2 z?caL8Of?h&)oXYc3LzCQo!q_pt;}fM4x_+F5*QoKKhv+l$p&;R>%Hqua0Mgf6J!~u zPu?3|gB|_Y-yYjVWySU<%?_Y&Cp@RsWHP(a^JZ2eP!N`bTe+Ses1DzJ9*539;TK{C z4p+Ld!u~#sZdN!rQyRl!8ePkxMY(hyD1w5^;v>wNd3sa%g`$h2E@J3JdhL-ZsLx2* zfqix3(*QZPR?w%>uPJXN`tU7Xil~|;7kVPn{cV_?=0QhPIf!258}gpk47*IOf=wQ3 z8_w!Ad^ig1ZIHK5aZ;An6V0mi7gPyiuyM^{9@^2xR+?1e{$QJcvM4e7g~!_HLx}X) zIfVUOb|Fw~RqR!yaQFhBH?4a_Ep%12$PBLa7)&76mnU)~zQ$+&DQp+L{0(pF{C%ii^vyRzhSeV!1NQ+i=glg~4$Dvg-b$w ztJbijRdMCGww~$6?I>1(lxz_bq&&1Vqo9<}*xQ|B%Z$sb_xM_4L|>jmDd#GW<86YU;UT8h+TMZM(Z*2E!RUbp7U}^x zBriHxu;H6zeP10EOM8Wz8~cL|mbC4$$GVcHf>i2I{l=HKf(N_`vfX^u zXxa#>dYQ6f#ad5wx>-)kGUrO5tbaCo*yzM?AfD7jb>y;l584%$*?iR%-wP+Ai+l*MVv*TMM;Sap=1$y z)vSj&=3R}lVLxufN5n~I;LhMy%eQ(Dy@Tb6!SAGICAVEd&>ARDE`6P*%c1x(U$hgH zz|sCfUXcvd&W5Q#Kh$LQ&7Jig) za44N(vA2i=ApRcV+pvyp&n_zv%4Nz6S-@n*;DBw|TGt0o6M}li7td7m4!=Ejb9Sx2 zwz=NgfBt!Y^@*tJzAP$I!k+WkrZCOb+*3r zBAf#Q0ReRD=URt%8wfpU|J~*b?2HVI%$y#&{~Vw70oqHrx^?Jl5Z(DSdf!rr1%RH6 zDibu~D_PaOl}6?aoQDv&;&kfGfV7>Ncwy4TILm=m6osSSqh8?43C9soMgch*3umF6 z-`k22r7t+G?$sk_S?H6S$nfzo7iIK`ba`*J$}kn6SSY zg@0p>(9DcfsC_(W>*j$SyINY@4T=-}xK|IwqlDnIG_mdb3>#0mYAh;q{aSrofcc2% z_!1+2yO4n8`uRcTdlNqbYRWBr@DJ;&O$W7MBazm|K?R9% zN+(1w1UB+MZd%lqF$Ie<@p4^x^uM~1xyk918ezYtf|jEYb*qgSFRn;5twj(yGs_yM zHu7`O8a=2itq9K@#Zxzq!6Y#7t@1;Pj+TV%vAM2S4;GJt>P#nsydsm7d@^dFgmT3- zk*AwW>|$Dqe?eaUvUZTFKU+RJk~dSCYIf4NuCj=p?1?K4vgrO=HpS3HJh7rckbrvb z0An{ZTY68&$ar+AdSKf;dM=}xDK+eE!Tp3pI!4&>Ml_FFNlssGTp_g(o5z#F|uMRV1NtwbByco9GonmTdzrO7c zu~?X=d(oe*n!^?3SsCTw;p${7=*p0P9VR2ToK{DfYgIauzd5j8eZUnmvOl zC>?#W73F;zd`yl(%huNJ7TGp7uISal4h2fXFyB)JNx~J#QnwHv&zQ&vvA3*z#7B6} z@SY$nO2=;U`C?ICCYp`$@G$ACUd@N>!(7l5QOlZf`+qd9*x)A zU}7a?a4nM`2i{N+gWzv@lg`FFly`Qn!E?RD%g4nV{kRiXlYWk6ADpWGq}V8p$F3w> z!&=~x0fYsFQ^TVx^Qq_Ilw#C!#+#B(+jOHx*jev6KcX+a7n79`84;OyYWbvZTJ^&d z=V9DjhDA-r_R|p=o2t?;!)qpo>`}wq^ZXT6tk`E4Xw$PpIA>Mc98NaZj=22^V*ZED z+gAC7%%k~@Z(e+5n8N$eH=9tG?LCg_&w!|Wd2AZ7@U#L)p7wLr*%i|@1z1MaQkmAE))LyYxpQE;v^F|xAM5U4%~HB~Fk`y1w{#q0 zP5HsXah$z!f+ih8%=*f=ZOX1Z;9fi`MtuGCc;(YI3u#4VRH>dR^r?}?{*+;ggH(yr22C;5T(dy_3$vKnNC$$AA(wL_qmpkJWgT)UC zq=S8=-^#Wg`%JIhXV~in>_LS>o}q!RfX4NVyCy{#kvogkh{~B1q8P6WW*1_tdzMn= z$y_mkgU@RQFo`1~L`hFIo+u171md`uQeiA`qO!8ucAR74#@Qw;!bb>w`D!ZdljzED zj;XTYsFabp<}q{Rl3Q}kIgC+`oSzvMEbvszQTu6q2O0P4S|VzqA7K&1PK8kg%E=D5 znB#YM?vcs7P6&-d9()&KOV^n-<_O;}p3jKpBi}WOp2k|#2?h^6*^h<@*84ykS1RJ`l~p`N@+9TZTM0vRkF1s!czwvPu5!p^kqcFVKxH zDIMCl=R4h7CQ~nNzBle4f#v_fPm7*xV|_%9siQ>aO>c$2$jPg%Kog(2WkZz5iY}jQ zH#MZOdN)%>anO5RwIO$c2o%Xm7I!>FrAZ>2uO1C^hN?}U#NUJ!#RcL*+8jIco4CC( z<{Ua3c1S*E^{d!>r_Gg}=-9NkaZT7pGv5~H^6fRl@WAE{feFp_jU>=4AB#Ot;9!@H zC*{>#(ijd4;$rWRW}_}^Cev3R5?qOhSD=v5tE|(*kxz$Ux+9IV@>P>xoWUrexg?|w zWcyh4dmK^Ugs(_+oNpUPaBy$PP&mJyU-y{x}bb>Bg&Suh2{bru)MMr`f#P27P? zSw~x8r@RO)4tdiWu$Ch8C9pcHx66U&ns-?CS5#eI;V*2^uR{68YiU!0EcTr(!P*4B9YWY@3)rTj_54%AIWnqGRPNSN`o93J_iR?5fHw-WFd0F~rZKi`3iL z<3Sm*+vvj5$j;;+7MI+^WMzHm5IPU8QDo`dkZRl*<6|2!_p@90u+l6!y!$SAS2vn5g7@sDw(X}(n=@3&@@*e4EQpp>AX%r&<+=wWCD^ulOk^vp=v<^CM&ej zz^4g+D3+G&D1+VwXtPJ(h zvAFSf%-Cr%7H~|H?lG`veYxPpmx%;F#885=0c?r zt5p_+m!VDi`RMFe0Z-R1pWuOyc3FmGo3uFEo-2CCC_h1r8-E+6Q0;uPfKT3OyK+** z2L~1^ZtG<16vJ`7u1`GSTNtqxWrcE-qCYk+6+L-wXitbGDl?`$UWO1V{)|vld*oTp zia^5tJ0rrGP}9yV7FeRbyzuAJqa(`DDhZ0wzb>D#T8^ONAvD{&B86!PjuuS!P@ zlWYo!ks(svJ{Sbn4AIsSs%%CG4%6x0j{<7^;^tZSp1F=%L70v0M<{dH8mt3TgE0 z$u@$-Q>(o*5fF_6U&To+a2%m9ILvdnc+na?l^y81PwIIi%{)A!N7rvsNLi~hp6L!s zQy+_Y(AOu)eNo%0(={+~Nik$(!F~E!Gl32~ykxAg_KT}bP1Xk59jPcwI9^E$ zG=;AW9xd90hmfXc)LEeL(WCEWO;gN6E?=zF(5w{rzWAmYb=%4Hu_iZ+;tl^^=2f-b zwCF)i-(}Ngx3a#eN?}2ey%=0!xs}R!_s|!eaaMY>CCabVp!>k#|PJlga(OG~;Cnqn>y zlagFhkC{<;A$3|>)`Vie4FXNBFC1mJ@>iQdn~_m&eiG*RhQNDNLAyv8P6pj8ab?u% zk<#oie2^XGRoWPExz~yJZMC`Y1aU_$@J;Inut#fIe&_Bzq+g(>V&G4n0^zV`KwuJQ zovgk#wbRCS_tr$j${0Y6Rt=eYs;w2$e0GM}%;^RLcH`G@Vrw4VEagsT_5SnWwFk?M zQEjX&iMaOD8c>LrF_W(|cL}pzp_EZ^TkE!R@}AQ34Hfy7AZOt=XS9Jnn?n7PSAsLI z)Bl9knL7y7+Pj~&vbTB~V`MnNd-w~w1x77-%Eu!=65Q+{oAnuR5b_>gfK`#1PnI0I zJ^TY~$Iq?G_-z&BAw93R=SCa7|27T$gY=WH;)wh#9l}eirWG3opYRVfdm3>Le)1BG z3KIUA))-#;DF&z-cElLa?cL`@>}IKPERhhRXZw45R&(=`F#;snn+}{>Y;RvTyC}Vb z^p&sHm8X6Vqs(7ex{lfU1Tr^|LN85`GG`dQsV%nTeFGJ}_a{&x$~r8Zz$NYJwhW?T z1jYSgdpQmYtF)!AB!ng-OAOKQ{4&LJ&k;Qa*ylbG=J7!I7BJgbba^A>T|ngoyL={& z8RkX^KDG(EA_`7vjT8bhAhoP`8|h4X5Rc`>&jQpvOXpaOg@&m)1CnT?t71DB`Y=Kw zUT&dfxrJJRVK%~jBZ^JMd){c|8K7j0+KJs=(&ZU$HWLmEHXg7OEVWzR@5}NeN=UzW z9J>fEr0%d#NG2$wBB#2t$RKDH5}InK@aeTCn0Ic(%$Y89ajBz0(gn}5)EYb8`V;+D zL|qPohVY7LP>Zswk?=O+^*qb)ZV0LIQI6H5HwsA2=7JA!k$OvX#)#c;bBmzy%k-w7EMkV(p~ z!3g0Ce(-OKZ==Kwu*pJEi@|^YMTVA^hUE6-b@KAa?D&(#9H%<2G;Om^(r9bK;K}En zk(%?WF>%a{>zfi0EfD0Rh(!=W?rZhPgkdp>*{6yTvHA;9RQ4t=!G+G6DyCB!@TUiKO0fDst|oCPY;C-SUg6$LBE z3uvLonJk{y?wc0`@cwi-?K@g4%U`CYIiLE#FmXA4m7?Vhc7w25!k)5`2>?n=(xN88 z9cmg}db2GBLUbZ+$ITl6zc>VOq(|$j{Mwn0S7N(KW7-yKgZ7${x;$PRBuAVPIpQrf z`KXbb1cRAJMc;Oj#vlQ63U|{9C-lMNgEQX#`QxaXGcQ4ysqtkSlLoC9SPmZFyV&-S zS?D%zQ#620_4Dl}@E8b)8tCu)bN*4P`?jqd(3|rO(3@i+RH@e^erD#1RfmFosn=4H zl@dshYSgG;zD%AvK(bvDe*=n1vpl_Jxgy<$1~!`O z(@Ju;WXytZ#S;=zs0^c2;Ig_@O_RxvbCyOLEZlo=Qhq@7CpU0UJ%_eH~N+#nCa zbQ>T$1O|kZxgUF}?NsPSSMK?M1((Cdy{}m7ha`@Sk$e9b%w_o8tj92(cf$QuLbEZX zv8M9l>#@|RmuaxHc^_8=V^ihE=Z;HoUV|}y$!{!KJIw_5MuQ@tT&AkuEI!Tvo}cP6)a?pmZWuEXJjU8 zwLeRGXmnI=M5X|uOI4aVSfBj$fPz7Fs&YoEfy_t`??_BtmGocaA{h6uz?>bMC4&|&Tw9nWS=Vg#RhyZeT1MkN zsxrfl#3MSsVP!yxklCq7w}r^#Dq47kh#!W$%L^sh-6)Ocu(2%|fW=+|DO!|WgP#Ay z85fa0)O|EWHP|GCf-NejgQKynqi@Mv71)A1UkB?t229Je$elCRBehF_n9!A!mhaE8 zBn=*l=Zst) z2w9Y-Fr^7oEWOfu2dSAEgvUc|Zd_23KIO|OS|c%A?^Q%6FFnWGLUIt_EJ0(=6|^NN znPx9@{IVYgGO2XA>X&61eHhm)D2v|aF?J` zI~=8~-pTFNQ*Apsn)5CRNW7#~L&fbN{()w)`^+!*Ox@p(5r&;772ttJS1GJkW>I?~ zF=jO01U1e+m$}_a3C$*(ubkxzvl3)88IGI7EffTzs)YTKoeip!+#IlMl3LIC#nS8> zZ0CJfmtFzFkD0@(N71+SKY-Qy!yewAaQ<@T2Jru1-evvXuV`jvZ2d3q^BmqzPV!r0 z`+agF7Xu4N!$0oJ0xs?VZX^BV&JcA7KXe-%iqLtoerizOv=eO&lBO%DE#hnnOT^l{ zHUH@l6V~W`zwU-O=nTZ^X;TC8bTd^FX!BHA>oLBN$GbF^W{zFv`z#IlHmptP4CMSM zs9-Z6eCqPjK+;S$K1fwqIgB1?iUC1zEM_MomhcQ6KCI`;fT%8ZG+l0~cPz zYe%(dZF)*XXdeBPtFsarHuMl*6cW6#+mD zzm`7XZv~`pr)PEB-tup6REgh06aN;#(CGGBt(lF3nYGpTYqfVbr+%^kFav#({nFBM zWD?XLXa}i!X!9zR#iV**g-P$BD40onrUeNSLE zx(aeD0tiUfAP8LETbclL{V9?kZ?64bA?=Ke|09kEU0?fg^52W&`{brZdWH{$0Ni1_ z4?%v**)K0Mf8S8|Aa1h+e@Gp`$=%NnchdO0VzI26u% zot9{ta=pXkAF5ibpCIk}u5&eHO7AGXlWCpic?Px(e_FW^^Xln_xJy2&j{@a7~$Pg0HERz zf&WO~o7-=Gzh}&DQ&KK}M~wXutevs0t}cjvSH%-IOQTHep)nk-ICUG{LJKhjViwtiyht-Z&msg;r*9xSOb}Q>9@!;oZtSi5bQFc(w?{c z0C@USy?#_+g5R#Zwf=J>1BZL+3%KL@6NC2W{P%b8E%2{jwD_*Rck|om8JPb|mGZhx z3aCAZiSHnQU({?ybOuIr>{EQU0I zzy7<$2KEAVzk0?mD0$26?THBpG?~{W(LJ)KC_@=Vc;jVfl7=g~6s1yn9ZTa@D>_&E z$s;M}t*cYl+;O^I3>q`|sxvjy{@Z3pQ5v2ISkX^2pM};sK8ugSCLCe(@UC(Zi$$!k zeS6c%NKubCZE&T&`sfM|^6bM?SrkXQuElV?SAz;HGCBuV{g=i!6cmbsL&cfqHy~h= zhg`zPL@+d2niQ89L`bGZ;4l$2m-H;B-S-W^c1qsTnBjv2owG?jDEG>rPC<+{5C zaUqqW`ubVeTD{NkhFYXh=)NnWMzlCQW!tc95*BYkdGm@hTVPOQO;Bn#x+=lN=h(9k z>>F=75)-<~)r$+*vVHAkux?fw_X9TuQz9~Dla~=lJi$U{J1Nx*S@OYQnGTIvgB@wZ zphICKdk?!};Ai>RD!g9gwFFHYDH8Ns3-YQtwhVQlNV3~X$AF&rA5a_e!Z+sNb6ZZ< zSq6X2*TQP(Cf|t`|3okot>T!$l$amoE=ggP6lUL9AOY-H^K~VPLqKNKxy}(3u-A&**D{8*bau$e7A|n$W9V5LQk zF$72`>{7&28VHcLK?37iJ~MQtm~SlsL6M#WiHiTNJpkB*pBs3e+ZbN;f6H2!8UC$j z_^Pv#(%;h3(m@9;f~5NtKlTl5KNpg1WoeBJjeUa|zvL9m0LFxh(-mjEUA}zwW$BuN zWA7lhc?x+vn8#PWkFEg*p6Y$g1IMEYZQJW+`nzt}NZm5lCwR&C3D8 zNqe;8OU0~9ZNPnk3Un;kMwrREQaXIiz*}Upxf9 zWp`QnVUB`=o$1*xj4_&h&j??9g;vLI8pzdCSzjcw!;sO5fpSXcdoqXEHu3TUeKcwG zaH4ZMEAQ;_$dOzHTPneBmQdPW>d4ztiMFo>ZzfgA%44r~S*s|Tl~@6L=@UKNsL?Iu z0P6o#w)=W;`^Db`nike~$A*6u(f|Hn@_q8}q4s}e_P4K>{7634Z`Jrugu7t=-y<>p zL!|%F5#__}%KW}T;Q!0oe*hnHP6-Jv1>|0UOs)f~VNQw7;xhvvJB1N^JosG26RJMO zBSo()=GU;j7=!D^{{}%u38pY-#0S;|d?&?y=*9P!Cx7}dWRcV$tr+X=V|e|NRCyJZ%%kPwjO!8-(p1zv zQj`PagS6CirIpMhj7;Nt9L zK>{dzCJ79J0{ZK>GH?In`?%%@eEiFoGyi}9=n3HM%==GF0%EtjQQ@iZD~NyolI9&> z08qf|mG|Fa1SE^@Mun#c@Gtm!_*<6u!2nM?-hZJCkPW>X6`nD-$%dZ_@!vf3cn1rZ z{zpd-c>e5eT=Q?;w%7d>_IpC-zx&{q;l!Uh@?-D~h>P#WH9tS(?{FUaING1;0zkX} zHVdFtuT|_{KKt<_{Qb+*fX4;y#; z;{QGVKfXQqBmDiFNPlHNg7EL*|G0hhPp*;uh=0FV`LFD25I+?EAKZTW5&nLg?q5}~ zmil4v|DrwmMJEy{P>bLk0>IS$20|@j-eFBu<@5VKMr@-%+y(>R_KVt!k@V_j^ z|9WBns!ha#55fChH-G5C?oOijOL2hm{M{h(4174~|Nq7MyS2C@^M2I}P}{m2M4mOG z50=e8Bl7`O!XK%;U%dJ&l_z5VOH@9f9(PA2KxRMc^^4R8YmacJ*Z(+{0QiLa zC8YZagbGhKiQj=ftju=D;+?wOFYMfRdKI3gx9uOlX7L{ssqP>FQvK1--ZxFx{CrZs zgZ!Qve(0|K8Q5Q4UDDr!{g1;4Kt<_pT=U;N5b@6Z|4(=YRG99@HNV+yHuKkO`DX#* z4*6d?z57*{`zNpp&yW9(fQRJ5f2Ur60?*yJ=0B8qm=yk9EO(#z`_V>)r>g8f!Tl!z z-Q6y}kA43vP~mCx>~FFEiIDFA?^hWB<&3*g;b|iGJHUT~b-$YMSFG({VLiBt@qY$O rJRpC8<|mLx|51y>ydWI@B6YSRz?&6#E+_+oF%X6VXGyd`1N#gBRAZq; literal 0 HcmV?d00001 diff --git a/python_blog_scraper.zip b/python_blog_scraper.zip new file mode 100644 index 0000000000000000000000000000000000000000..ab98b3d49955b5f587ee97c42aab87b81748600d GIT binary patch literal 2784 zcmbVO2{e@J8=o=uQDliq8L~5rLAEgVWo+pVrHL#vvJS(D5g}Ak9duQ4Ye=$Xiir^A zPRdLOSt9#RDoeyIxikLxb2>Ta-ury#eV^|;-#O3o{GNCD{T$FBPEi2cTq=i-tiAl0 zL;(;$P(XfheuNxk zg`dBYANUb?xwdxjyAiA`(foycE*=rQIw14~(mAlye4-O_?E1Ee)azvM+(qjo_r$`R z84@>-0QvTl01jv#-jHUS_-Ga;Hm(iC+e$VNhXMhBeOn+lT|Tnz+NA2?^;)^rsbnkrl)D zDWyBVhvszk_pO{?=u=VPJDf3~j5u3OE$Rc7wJ* zy|TjxLaz32SzVO1rL2l%O68MEYSYMrx57WyntYJT0`rt2DibeCEL$D*9e>hsHhFZe zUDzPak1DUXhz)!mc3f<9EYm8?kloIrD!Z;>Rx8;}+i2%NX4n6>q%G^O>=x}L;(|#& zL|gzaFf;@f8XmgV)Tq~FJg0a(p6!2Xs)}NLbe5P*qzfOo>}b9m&J*b#-Ly`k0q|S9e?vkX`z#gtXtR zSjDjm+$bG&Z(}D!3jzR&Ta^Cegq^~9{5W8+poq2dVykZ2wdF_nnUs-W?E) z{z8Qa2&4k+t=@PEc6}c3>cgv|U5@Q>;f#f#X^ohjm{ea6fv=zRo>C^b>4zWTqwDRoj4vOj@{t3=4v+V3hBP*fb{bz77SxHr3Rf6+*v+Y&*rMYV7jeYtrN3Vrxuo8e#) z_dpD4dg7FUcR@C{D1u(=>s2Q@rEe&aooiD&I2qdg5lZO1h!I#F(#v6_v3jQ@BCCS1 zFnA0q=9S@JD$iODPK>FFWL6HWUhn@_b7cSO9s`qJH-+-OQ!~bCx2PUSolDJNZIZUo zpy0DeDpAde2&N95Eci-TIs0I ze`as&CR~jaH>>K3Ysoxqlw<0s){}~~Z_S?3f4hKJRG~D=r6_!eLT4E_fGYu`fhFfM zALK}$RE+9>cY6V|AWc<~=<4pi26B`|9`T{+c_q&)L#rV@?paMsScIdQT46Bafp9J+ z!6^UH+r2*GC7fa!E=SMGCp;9cSEmcpR537 zNU^!YDAbcI2*7|1gD&a9Q69kWB7Bu1)0QPu>60febtAB^`Vf7}xXH_R+iQ&8n7A$} z$lX2TrUUd7lI2L(2VY=<_r=97C%&=O#GVS$nWi7J23GF4YA)RAeU9OBAd#^(MU^qy=kS%!q!TK4<1;GEoxbfL5KmKFOX2r+SGY}f#g)h5 zDJg$yc@tj4Yy8ZJ87c3$8e5HPLam95AV%HtyAXTvo*Q&m!%3ez89pz}og>Hna*b2_ z(|CIg1>kwWTm@fkjjIy=Z8iI2vTFm?&0m?1(BHo2kiV%UNi_GB5;dWJx{;Y8SZ8K* zKtCf2qkal>x<_OJ$pu?+Qa~9}F20fW@c~}aOu-IV88PjJLcZ>usTN&Qo;>nVCUMk) z%0V}n|Cp1WVW*P`Xqg*-SJgg5u5slw8R7_w(xRhA4jWUHKbfv3Px$plMkbc`(evhb zl`mJjW2y8>QuxCCfwZ2tl7D00?qL-p;8WGNHVFz57ip}Vh=PfR%5)i{VPbUjfJ zbb?LYx3Z|7NIs#H+V;v&T(`9b0h$zkuVcx8?`S|BM(@hw??3M<6`t!x!<3p=#1Rj+ z1~!^{uW**`-ZN-9nZHBev|PbGIliP5(RRXJNoigMD^oQGmwDJXs;8?~XAdU;z`i#f z&_E7R(AEltb?L6ziV@rWH@)(|4C{(zGqbZ5%ev8G9kh!Wb@$W^ Date: Wed, 3 Jul 2024 19:54:44 +0530 Subject: [PATCH 3/6] Code for scraping Python blog page --- Homework/DataEngineering-Workshop1 | 1 - 1 file changed, 1 deletion(-) delete mode 160000 Homework/DataEngineering-Workshop1 diff --git a/Homework/DataEngineering-Workshop1 b/Homework/DataEngineering-Workshop1 deleted file mode 160000 index a90eaa9..0000000 --- a/Homework/DataEngineering-Workshop1 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a90eaa9d64f2a2a98070172efd16d8eef824400e From c030f125d4ead0f879be92ec619ea58342eba52c Mon Sep 17 00:00:00 2001 From: MrPrajwal12 Date: Wed, 3 Jul 2024 20:36:07 +0530 Subject: [PATCH 4/6] Code for Scaping Python blog page --- SJEC_session1_CS106_Prajwal/Dockerfile | 12 ++ .../docker-compose.yml | 9 ++ .../python_web_scrape.py | 121 ++++++++++++++++++ SJEC_session1_CS106_Prajwal/requirements.txt | 4 + 4 files changed, 146 insertions(+) create mode 100644 SJEC_session1_CS106_Prajwal/Dockerfile create mode 100644 SJEC_session1_CS106_Prajwal/docker-compose.yml create mode 100644 SJEC_session1_CS106_Prajwal/python_web_scrape.py create mode 100644 SJEC_session1_CS106_Prajwal/requirements.txt diff --git a/SJEC_session1_CS106_Prajwal/Dockerfile b/SJEC_session1_CS106_Prajwal/Dockerfile new file mode 100644 index 0000000..7b1479e --- /dev/null +++ b/SJEC_session1_CS106_Prajwal/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.10.2-alpine3.15 +# Create directories +RUN mkdir -p /root/workspace/src +COPY ./python_web_scrape.py /root/workspace/src +# Switch to project directory +WORKDIR /root/workspace/src +# Install required packages +RUN pip install --upgrade pip +RUN pip install requests bs4 html5lib +RUN pip install psycopg2-binary +CMD ["python_web_scrape.py"] +ENTRYPOINT ["python"] \ No newline at end of file diff --git a/SJEC_session1_CS106_Prajwal/docker-compose.yml b/SJEC_session1_CS106_Prajwal/docker-compose.yml new file mode 100644 index 0000000..0876790 --- /dev/null +++ b/SJEC_session1_CS106_Prajwal/docker-compose.yml @@ -0,0 +1,9 @@ +psql-db: + image: 'postgres:14' + container_name: psql-db + environment: + - PGPASSWORD=123456 + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=123456 + ports: + - '5434:5432' \ No newline at end of file diff --git a/SJEC_session1_CS106_Prajwal/python_web_scrape.py b/SJEC_session1_CS106_Prajwal/python_web_scrape.py new file mode 100644 index 0000000..5316336 --- /dev/null +++ b/SJEC_session1_CS106_Prajwal/python_web_scrape.py @@ -0,0 +1,121 @@ +import requests +from bs4 import BeautifulSoup +import re +import psycopg2 +from psycopg2 import Error + +url = 'https://blog.python.org/' + +def create_connection(db_name, db_user, db_password, db_host, db_port): + try: + connection = psycopg2.connect( + database=db_name, + user=db_user, + password=db_password, + host=db_host, + port=db_port + ) + print("Connection to PostgreSQL DB successful") + return connection + except Error as e: + print(f"The error '{e}' occurred") + return None + +def execute_query(connection, data): + cursor = connection.cursor() + try: + query = """ + INSERT INTO python_blog_articles (date, title, body, author) + VALUES (%s, %s, %s, %s) + """ + cursor.execute(query, data) + connection.commit() + print("Query executed successfully") + except Error as e: + print(f"The error '{e}' occurred") + +def create_table(connection): + cursor = connection.cursor() + try: + create_table_query = """ + CREATE TABLE IF NOT EXISTS python_blog_articles ( + id SERIAL PRIMARY KEY, + date VARCHAR(100), + title TEXT, + body TEXT, + author VARCHAR(100) + ); + """ + cursor.execute(create_table_query) + connection.commit() + print("Table created successfully or already exists") + except Error as e: + print(f"The error '{e}' occurred") + +def process_page(soup, date, titletext, bodytext, author): + for div in soup.find_all('div', class_='date-outer'): + date_header = div.find('h2', class_='date-header') + if date_header: + date_text = date_header.find('span').get_text(strip=True) + date.append(date_text) + + for post in div.find_all('div', class_='post-outer'): + title_head = post.find('h3', class_='post-title entry-title') + if title_head: + titletext.append(title_head.text.strip()) + + content_div = post.find('div', class_='post-body entry-content') + if content_div: + paragraph_text = ' '.join([p.text.strip() for p in content_div.find_all('p')]) + bodytext.append(paragraph_text) + + footer_head = post.find('div', class_='post-footer') + if footer_head: + footer_text = footer_head.find('span', class_='post-author vcard').text.strip() + author.append(footer_text) + +def main(): + db_name = 'webdemo' + db_user = 'postgres' + db_password = '123456' + db_host = 'localhost' + db_port = '5434' + + connection = create_connection(db_name, db_user, db_password, db_host, db_port) + + if connection: + try: + date = [] + titletext = [] + bodytext = [] + author = [] + + res = requests.get(url) + soup = BeautifulSoup(res.content, 'html5lib') + process_page(soup, date, titletext, bodytext, author) + + while len(titletext) < 50: + older_posts_link = soup.find('a', string=re.compile(r'Older Posts', re.IGNORECASE)) + if older_posts_link: + next_page_url = older_posts_link['href'] + res = requests.get(next_page_url) + soup = BeautifulSoup(res.content, 'html5lib') + process_page(soup, date, titletext, bodytext, author) + else: + break + + create_table(connection) + for i in range(len(titletext)): + data = (date[i], titletext[i], bodytext[i], author[i]) + execute_query(connection, data) + + except Error as e: + print(f"Error: {e}") + + finally: + if connection: + connection.close() + print("PostgreSQL connection is closed") + +if __name__ == "__main__": + main() diff --git a/SJEC_session1_CS106_Prajwal/requirements.txt b/SJEC_session1_CS106_Prajwal/requirements.txt new file mode 100644 index 0000000..02a89c4 --- /dev/null +++ b/SJEC_session1_CS106_Prajwal/requirements.txt @@ -0,0 +1,4 @@ +psycopg2-binary==2.9.3 +beautifulsoup4==4.11.1 +requests==2.27.1 +html5lib==1.1 From 4873156a579cef227e8f481fd2d8791869ad9e2c Mon Sep 17 00:00:00 2001 From: MrPrajwal12 Date: Wed, 3 Jul 2024 20:43:50 +0530 Subject: [PATCH 5/6] Remove unnecessary zip files --- SJEC_CS106_workshop1.zip | Bin 27541 -> 0 bytes python_blog_scraper.zip | Bin 2784 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 SJEC_CS106_workshop1.zip delete mode 100644 python_blog_scraper.zip diff --git a/SJEC_CS106_workshop1.zip b/SJEC_CS106_workshop1.zip deleted file mode 100644 index 76e2e0040a5ac5435853cdaf4d3a0aedce41f411..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 27541 zcmdqJbyOBw(>F{?OG`<2cXu~Pceiwd2udR@CEXz1-HmjYbV#>^Ajr$*xw#L=Pd%LT ze9s^6yVrtC*Tveunc1^vX3xyt&m=*?P=EkKw|+(S`#=8k3l#_tNI^naP)AUKo{mMw z+1k$B-qhNLo>oZ_4hX`6*T*>h_6P9k=mG--47Ldh1oZvav+q+v0zm=|m;O0r_3gBz zznap}+Q8h%j@rQ5(#G1}h{n~@;yZ{f%^v_i0ih(L=^v36FI$m+_Mu-|O14t|8SMvJ z8gW_)>M8~%IS6Rdjg4p`I7lK;&o4hA9z8)77>dvl z=$Rx0r2VjzkH;-j0LK1g8LsS4mK}hBfN+1sve0*yjm<2KzO%aeYgQe@T4mbl5QGlC z1uDI80`b+Z$;K1T5jP5a=9|}RtNi8-7Hw5N*{d*^oOe~-NpaPjUkUP1hfz23C(8tYTh-P>%8ZAa{ z&WNN_Z;%sla~;cCBVQ~hyPUKN?%}4vK$O$f`30e0h^sx{Xqfa4{&YNn#1m;=Ou~7l zY1QGRh5BhdQQV0aAkTt&nVHpFSpuZ?r#$)I|0z!ocpxD1U&+(P)xp%-O2^qqU&r3S zPS3`O#>Vx#WIdn$Qi%#xG^}S?P(03y@nB&ifMmtT4B=-^mJAn+sU+1ku++gk@gyq- zl%6O>JlBW)dcfHrMnFtF9S;R5aCE5I946Ym_Dqfhp*_XCA3VhZgF6+DS5ye5$VMHe ztT2eVFdxcHhBZGBmr;FhutNLuBpEGwh6O{enkT{9o^Q^VPdz=^sr4lsf!x%By-uVH zh7qgOz34LFG7Rc5Y{)(_9!=uCC$CK)pD=ezQRhW{7VWh=EqkxIsbzzvDTBb}g?%c( zlf)d53PsCQ@8KrJ<4Q3q-D^8u z)w;ap@Nu8odMZc>_GX1MvnTRacI3!JlysszHK0Y11khiq(=KfNUe~+z)-Bz{E?zaM z65e3(2+r$}R%34;3PdIs>oc&248yQFvLP(Px40GC(JR}*6t9@(U2$U-E-!6*I(Bw|S+n-xm3J3qHKk=c786Qquy# zgK`1((q!=AySct0lcT4&=9B!Wmm>ICdCrn1phZ?d2dR+(O}Ku>Yb5a{f2NQ;{agft z>}dKS_w=bT8D;7K@mtb8cgZw?ewaF-PgZ6Ai6xm&P08FBXH!mdB z3{{7`D^|80v!xsIkxsL3fo_4qXe%^pqKSV@Q2Prhan>sxc3^V^B9Itvm^ZyJI9{F? zeq++in$9-tyX_hhz;%dGB1lsv{ypj}em$6JJ>wkDKUZyhU@D+h>$O{|(2DSMcdzQ1 zI5w_p!S}7hG>)$oQNBX7wtJbXT~7NRvsPzo#;e-PIQ4BCtBLgxZIijMUHscw@PSc* zCI<<-Gti#Si&=YFw7Ix;ZQd=jjCivy5f!%$^K5~L#Yo6`URc^3;B3<8Y)nz;7UNBX zuhUzt7)AO-*4n3cKu8kEpW2B$$3+orKZr|w3*RZs&%zz=uEgjJ`f?6=lL4IOTA7rO zH#KMs-^>g+g!!%Jk~n{_EQ0-4?88QsFBDs<`viWUM2kV%c|PxF#>C68CjyNeZ_L(D zth{K_1`}KXadKyE=UY<{NToOsx(VJipLcWft6pf&IfkSNZPe2&Zr^frIW1iBT0Kg) zSbF6CA}xiN@$uOUTtOixm6#ZWtiE8~O%<7@XQ-z$4W#hOG~0#ZqK%uuy6h3-eY|KK zqfK<6TSzC9w z4{uL->*Y&te*m99_4a^y*?*W9@3-@sT3egj|1mr1e`W`K{&aRgEOCE!C!|}n2h474 zZ)t61=3s3{?X2fuU}~vnMPsjLY4abK3xK=(`2n_2#nK*`72%u3P2i`)V9TyE@XIWN z$O`ODn54&sF62r0OK%ShHP)!Kh_r%9nVTEyDU)QIdfQa_7-{=*ko>l2hVf^ft4w|mq@c2JI@a~4n#M|7 ztys@K_Qj!Uw`nYwUkI3@9P~HR&eKW1Cx}E=MDLGEQHk_q*kTCG(o$U^Sb98Tn-#^b zNW3=M*H4I4k?{%1XHZYij#;UOSIO7Gd)W!eGWlV)zSt8mRU0Y2U+SCTt1?rjy_-=}&#VoR8M+ui7j@%FSPcjRcSd0>NBq zy2J)aLLtAI#Tu5`f^|eVD7pla@lD}WRUFKSdjm!Zvphk(S6;!~hUhfcvBvf4vuJQAaBgR2sy#g)N)8TAY9~Pvf*}W4ZBC}v=}rOB2pj)% zh355#WRUS6$lsZWjev1n+)V=G97P{P>qW$fX z^xY&@Qd}p9z3^7cu58{6m0@#-gNw`Q<<=0bj=(5~*T+pbmLZ3>$>e?`r$I!RO3w+i zT*#eX)4Z)GPqU+XSMz2*MEk6TI4P-NO7g>LTVflVSE|>u4C5@-(wVXV-4HkHl0OS2 z@xs&nWGJ8@3HJ!pui8r9R2{vJKFhwpmdh^=c9D#IFnZ5Hlz&jdTBYWO&f}GXNLb(J znH+LjVg8)LhMcJA!>{>hL;mf}+9B~FAt)3iryR-ie3a&?Kna2aV`B--va(RjI646m zCZBDS4E;9qpfoi|`+UYz-@LHwe@hhd$XC0z@PJ=-Jt=0(?g?hvUYBxR-eJugZ*4Dh zWKyx);VSAYyZ41thE~$R#3*j+h=T7s11UtMCzsQ)2=}|tB(1#J)rsW1Q6RDHo^u@V z+eIwl(tJ~|!Zn2Wq9c$~5*gh!`A*J2^k4Rp4s*kFACPl3c^KlRn8Pyu*beg)B`C??m*Ny!-}H7SormBTOyyxhKGJ7 zMG5~VPQw_iVB+2Awn?a>w`cHXIF!9IjKc&z`Rc5Dj2?pRo5xuL#9qk8b=dHM`4}yt zmriBo2#pLOiWKA|VZP?9rUXDFfKDS1(` zvm;E?gP1h2wZr8D!R%|ou}iz7Y@*qdRA2b=v@2BCMhV8h8iM)zekdXC6{uwo^VbG@ z#*V|06LSa!wXPzW$Qb~6ot^+<6#+-dm|Kp@U`4}QI#kxe{&>^1$YtBC|Ly8g#F8qN z-Metrv?mV0kRk%KBMf2hH_K_1K{3xS5yn}pOr_LMhQpK0jBIB|58Kjc6d@h)ARBmW z5O@8cBZa|Jh~8$EI6N!1M<2v_db*^Cwgyh5h9tJ*m#VNz%nD1+lWIFQf`)cb!XOar z&$?E6o{SOp)<1&XuRAQVYiKB?Y`|GB6q#FVX9PzR`)L4l2nUp)%e!Z(k>}jn3sB_N z7-1v}#B6b@6vsNp-?$};kKhSvq7^UBRkXEa=>D~2`U`kvVWEzWL*A2XFP+n~G;=AoQ#dzBus0m!!vtr| zETyiGoLBL(r=$^b5+g^ZWaTieaq^mQ;H#hPyO?IZt)H_x;T61G-le#PM|m|CqHhlc z$+t5z6!&gYA14fde>urZWwP{iwQ>h6G&V2&%b=`JgIK;xHA~b>1?Fc&K4?4$FZMxeYE-V+-vA#sKUi-9K zPYMxj1$A;N08_T;z^n^Lkzidrbe#Y*e=5^)P@Nz~`SR+GKDjatOV6PFuum?d(i5Fz z&+;U_YqH+@6DnzR*ce`DESWX?R^7=GyImuz*^TMxH;U^>Bb>-e z(j$xp>7l{V@+cQ0xUv?B<`qaEG=gpH=Of!TvBHr%*vAX)jaJ`40U^b~OD@+Z5I{fx z7yRcb?Y?u)0R{qMet>hfu`{C9v$3&owE_6%2Rde@a19wydRXC`@<2(BJic({D`f#M z?h?=b@i8GO61m+i(2j+p6X2Eek38LHD^~MewgatH{UFwao0|$4koX($HYx^)VqMA4 zcV*eYXaisu*S8X@eQ4gJ2%F}=J_FO~wV}|id)>4SRxI`I<*|3#G2eL$H9Wn^cVxc@xlF%zWw6ZXm zE|rHOQ&sxEkt&9aGH$J6zVnEG+Lpu(#+^U1@)UC7u694&H zEIe>6{#*xYOM8Q^f)G_<={ zt-G}63R!sydvbl%Tiignrd2__`dlH#RdmqV1P@9UfjND6x;HTdaPSGVhT9Blzc0JSWblWxJ%85EYPOU?to{QXt^TzIu@Unh6sBD{ z&)7lbfRC2~=Nli}C}KPbp;z{WLh?`xBIT`+Yj3(#e=8_gsknYy2` z40|gFcj!J&t%nh4w<8a0|JFS1&da;KFMX4Pb_xSn8DH5iP-wTZ2gv{DD`Wl$2#6l& z7a6wif%hK)GGLe7ro}AH93HqTmWq0}ISqtsi|fFAWFk0S4sl*%N1*n;5+Htv0^qYW z>PPBaYKg@1#N?bbf!Ak^8HUclm(nO<&IgTd&L-AHQj3sOu%7p;Ae?t>b#C467`F?4(QZh& z77mLE4yGy9l%a@=Kgif^BD;HniJ|~U;-xDB-2CxK*67pVSOrXh2*~I35>%*+7S4$+ zgt#b7;frg%3(9#NtOc$5CbP}?>rPA0vZC0{kSdyNUmuh?AsP~slTC}VIt;n_x1~zL z2AMXiRgw@Ao^e-~xHMy7TpQNlf`tZ!ew6ZN0bJ3|6cie-Hw=U!7ES1V;4W9r)uujm zPpldklP3}i%YYsw`6cgozJ%r;3c7-{cAzT64tlqsrOiLDVKy}r7X#d;R_!wx@bu?zf^2ZaEM2` z82jW-bv`N2u@7Nm#`z$lY)dT5LOd->dP(Dxn~@A9te-HUs=Yt)rJ?D7#C^Gn1aej0ehH(oa*II%X&K%_$4$)A7z{l3Gcc@$L8nME&*h_} z5C=DVjIFvb{IHkMDvW`X(KC^8vs2m-JRmKwq%|2`nQ1_yic5l%pPsdkSUvR{f*oG( z1Z#csD9BKdQfa$PuKV=kkiF?X`IAsC3I9ly45B^_o8hPQPO9O?CabowGPAyn z&;`^vIysNjt*7$IEYy--t4mYI}D> zY}4io_a01VbSuu)JbnSo@AM`)%=2(mC`gv|0ER6#vDqZs+d1idOM3`H?Kh)R4(g6g zI1CeCga*NP`Q9|vg%nz+*gIllFs1mfgH|8mhVF?8+<0y3U0-CrBhb90Hd*7#c=}<<>A*gIhuKmdPM)yh^WD&%`OdFlgljXKKdqqA8;f^RG&0Qbd=4xneLY0WD~j zZ;MA=Qtw<&ra)0HU}$SXQKf`Xp&d$fSrwVxMP2h%W@bJ(VjUYfkfc^3FowEoH1jPD~opZZus2!}S4NQ#;%&i?C>PkbDrR{HB zDYs}d$Xo8hdHPSbp)!%7t96l2FX;<8EoCtgGy`+QU$^=jR*u~)R09c(g&2=UQrn&D zEzVZMBsVk^RMwu%>)R4o1*eL#2Xns}{0OleJ!XK=8py`N5FRezn2D2tD~ z7V=m>xJiRr2FQNVECu*bC81E*E9d;F61#k6wPJ`occNBOkd~+j-t1e&*3WVQAdfPj z%;N3JMEN8`gVPybJ$|-!SU&OsyI@u)a~kXkhr|pJluTqw^ic70(va=LVpz@Oz{fdh zi@32+#$CFQkA!laC&FhI*QcE^_|?X?r5zYdO-rI>`Y_vo$B@na)?V&Htb+ECn18mS z#e8nG+wOaaNXMWs@kZ~2jITO(ILQ(IqeWS779N82CL$e6Hyfr!YTV=WNSszSp=x2; z5f0AG%{P=l>lV@2kFd2(E}mo64JFb(!cZJNhpKq3d05^U9twTo_eu7M{NwOrc}4?| zjBZkjTH5@Lb$%;2q7=Q|n8Y#ZeFHf}AZACqd`pMox7CDTpW3Jh%(wR*Uz)O z0VZ0?09X-L!uVitxzkio@X7i~VE1QG#Tc%12n+IH<_VoODl{B@B&u0cOi)`V^wO>f zh`?&JEAs#zyvFd4Lv`l!q}RMU)R@vMyP8*xF8q1rD|=|g9mgm6#2$9+DnYMm+HkQ2 z?caL8Of?h&)oXYc3LzCQo!q_pt;}fM4x_+F5*QoKKhv+l$p&;R>%Hqua0Mgf6J!~u zPu?3|gB|_Y-yYjVWySU<%?_Y&Cp@RsWHP(a^JZ2eP!N`bTe+Ses1DzJ9*539;TK{C z4p+Ld!u~#sZdN!rQyRl!8ePkxMY(hyD1w5^;v>wNd3sa%g`$h2E@J3JdhL-ZsLx2* zfqix3(*QZPR?w%>uPJXN`tU7Xil~|;7kVPn{cV_?=0QhPIf!258}gpk47*IOf=wQ3 z8_w!Ad^ig1ZIHK5aZ;An6V0mi7gPyiuyM^{9@^2xR+?1e{$QJcvM4e7g~!_HLx}X) zIfVUOb|Fw~RqR!yaQFhBH?4a_Ep%12$PBLa7)&76mnU)~zQ$+&DQp+L{0(pF{C%ii^vyRzhSeV!1NQ+i=glg~4$Dvg-b$w ztJbijRdMCGww~$6?I>1(lxz_bq&&1Vqo9<}*xQ|B%Z$sb_xM_4L|>jmDd#GW<86YU;UT8h+TMZM(Z*2E!RUbp7U}^x zBriHxu;H6zeP10EOM8Wz8~cL|mbC4$$GVcHf>i2I{l=HKf(N_`vfX^u zXxa#>dYQ6f#ad5wx>-)kGUrO5tbaCo*yzM?AfD7jb>y;l584%$*?iR%-wP+Ai+l*MVv*TMM;Sap=1$y z)vSj&=3R}lVLxufN5n~I;LhMy%eQ(Dy@Tb6!SAGICAVEd&>ARDE`6P*%c1x(U$hgH zz|sCfUXcvd&W5Q#Kh$LQ&7Jig) za44N(vA2i=ApRcV+pvyp&n_zv%4Nz6S-@n*;DBw|TGt0o6M}li7td7m4!=Ejb9Sx2 zwz=NgfBt!Y^@*tJzAP$I!k+WkrZCOb+*3r zBAf#Q0ReRD=URt%8wfpU|J~*b?2HVI%$y#&{~Vw70oqHrx^?Jl5Z(DSdf!rr1%RH6 zDibu~D_PaOl}6?aoQDv&;&kfGfV7>Ncwy4TILm=m6osSSqh8?43C9soMgch*3umF6 z-`k22r7t+G?$sk_S?H6S$nfzo7iIK`ba`*J$}kn6SSY zg@0p>(9DcfsC_(W>*j$SyINY@4T=-}xK|IwqlDnIG_mdb3>#0mYAh;q{aSrofcc2% z_!1+2yO4n8`uRcTdlNqbYRWBr@DJ;&O$W7MBazm|K?R9% zN+(1w1UB+MZd%lqF$Ie<@p4^x^uM~1xyk918ezYtf|jEYb*qgSFRn;5twj(yGs_yM zHu7`O8a=2itq9K@#Zxzq!6Y#7t@1;Pj+TV%vAM2S4;GJt>P#nsydsm7d@^dFgmT3- zk*AwW>|$Dqe?eaUvUZTFKU+RJk~dSCYIf4NuCj=p?1?K4vgrO=HpS3HJh7rckbrvb z0An{ZTY68&$ar+AdSKf;dM=}xDK+eE!Tp3pI!4&>Ml_FFNlssGTp_g(o5z#F|uMRV1NtwbByco9GonmTdzrO7c zu~?X=d(oe*n!^?3SsCTw;p${7=*p0P9VR2ToK{DfYgIauzd5j8eZUnmvOl zC>?#W73F;zd`yl(%huNJ7TGp7uISal4h2fXFyB)JNx~J#QnwHv&zQ&vvA3*z#7B6} z@SY$nO2=;U`C?ICCYp`$@G$ACUd@N>!(7l5QOlZf`+qd9*x)A zU}7a?a4nM`2i{N+gWzv@lg`FFly`Qn!E?RD%g4nV{kRiXlYWk6ADpWGq}V8p$F3w> z!&=~x0fYsFQ^TVx^Qq_Ilw#C!#+#B(+jOHx*jev6KcX+a7n79`84;OyYWbvZTJ^&d z=V9DjhDA-r_R|p=o2t?;!)qpo>`}wq^ZXT6tk`E4Xw$PpIA>Mc98NaZj=22^V*ZED z+gAC7%%k~@Z(e+5n8N$eH=9tG?LCg_&w!|Wd2AZ7@U#L)p7wLr*%i|@1z1MaQkmAE))LyYxpQE;v^F|xAM5U4%~HB~Fk`y1w{#q0 zP5HsXah$z!f+ih8%=*f=ZOX1Z;9fi`MtuGCc;(YI3u#4VRH>dR^r?}?{*+;ggH(yr22C;5T(dy_3$vKnNC$$AA(wL_qmpkJWgT)UC zq=S8=-^#Wg`%JIhXV~in>_LS>o}q!RfX4NVyCy{#kvogkh{~B1q8P6WW*1_tdzMn= z$y_mkgU@RQFo`1~L`hFIo+u171md`uQeiA`qO!8ucAR74#@Qw;!bb>w`D!ZdljzED zj;XTYsFabp<}q{Rl3Q}kIgC+`oSzvMEbvszQTu6q2O0P4S|VzqA7K&1PK8kg%E=D5 znB#YM?vcs7P6&-d9()&KOV^n-<_O;}p3jKpBi}WOp2k|#2?h^6*^h<@*84ykS1RJ`l~p`N@+9TZTM0vRkF1s!czwvPu5!p^kqcFVKxH zDIMCl=R4h7CQ~nNzBle4f#v_fPm7*xV|_%9siQ>aO>c$2$jPg%Kog(2WkZz5iY}jQ zH#MZOdN)%>anO5RwIO$c2o%Xm7I!>FrAZ>2uO1C^hN?}U#NUJ!#RcL*+8jIco4CC( z<{Ua3c1S*E^{d!>r_Gg}=-9NkaZT7pGv5~H^6fRl@WAE{feFp_jU>=4AB#Ot;9!@H zC*{>#(ijd4;$rWRW}_}^Cev3R5?qOhSD=v5tE|(*kxz$Ux+9IV@>P>xoWUrexg?|w zWcyh4dmK^Ugs(_+oNpUPaBy$PP&mJyU-y{x}bb>Bg&Suh2{bru)MMr`f#P27P? zSw~x8r@RO)4tdiWu$Ch8C9pcHx66U&ns-?CS5#eI;V*2^uR{68YiU!0EcTr(!P*4B9YWY@3)rTj_54%AIWnqGRPNSN`o93J_iR?5fHw-WFd0F~rZKi`3iL z<3Sm*+vvj5$j;;+7MI+^WMzHm5IPU8QDo`dkZRl*<6|2!_p@90u+l6!y!$SAS2vn5g7@sDw(X}(n=@3&@@*e4EQpp>AX%r&<+=wWCD^ulOk^vp=v<^CM&ej zz^4g+D3+G&D1+VwXtPJ(h zvAFSf%-Cr%7H~|H?lG`veYxPpmx%;F#885=0c?r zt5p_+m!VDi`RMFe0Z-R1pWuOyc3FmGo3uFEo-2CCC_h1r8-E+6Q0;uPfKT3OyK+** z2L~1^ZtG<16vJ`7u1`GSTNtqxWrcE-qCYk+6+L-wXitbGDl?`$UWO1V{)|vld*oTp zia^5tJ0rrGP}9yV7FeRbyzuAJqa(`DDhZ0wzb>D#T8^ONAvD{&B86!PjuuS!P@ zlWYo!ks(svJ{Sbn4AIsSs%%CG4%6x0j{<7^;^tZSp1F=%L70v0M<{dH8mt3TgE0 z$u@$-Q>(o*5fF_6U&To+a2%m9ILvdnc+na?l^y81PwIIi%{)A!N7rvsNLi~hp6L!s zQy+_Y(AOu)eNo%0(={+~Nik$(!F~E!Gl32~ykxAg_KT}bP1Xk59jPcwI9^E$ zG=;AW9xd90hmfXc)LEeL(WCEWO;gN6E?=zF(5w{rzWAmYb=%4Hu_iZ+;tl^^=2f-b zwCF)i-(}Ngx3a#eN?}2ey%=0!xs}R!_s|!eaaMY>CCabVp!>k#|PJlga(OG~;Cnqn>y zlagFhkC{<;A$3|>)`Vie4FXNBFC1mJ@>iQdn~_m&eiG*RhQNDNLAyv8P6pj8ab?u% zk<#oie2^XGRoWPExz~yJZMC`Y1aU_$@J;Inut#fIe&_Bzq+g(>V&G4n0^zV`KwuJQ zovgk#wbRCS_tr$j${0Y6Rt=eYs;w2$e0GM}%;^RLcH`G@Vrw4VEagsT_5SnWwFk?M zQEjX&iMaOD8c>LrF_W(|cL}pzp_EZ^TkE!R@}AQ34Hfy7AZOt=XS9Jnn?n7PSAsLI z)Bl9knL7y7+Pj~&vbTB~V`MnNd-w~w1x77-%Eu!=65Q+{oAnuR5b_>gfK`#1PnI0I zJ^TY~$Iq?G_-z&BAw93R=SCa7|27T$gY=WH;)wh#9l}eirWG3opYRVfdm3>Le)1BG z3KIUA))-#;DF&z-cElLa?cL`@>}IKPERhhRXZw45R&(=`F#;snn+}{>Y;RvTyC}Vb z^p&sHm8X6Vqs(7ex{lfU1Tr^|LN85`GG`dQsV%nTeFGJ}_a{&x$~r8Zz$NYJwhW?T z1jYSgdpQmYtF)!AB!ng-OAOKQ{4&LJ&k;Qa*ylbG=J7!I7BJgbba^A>T|ngoyL={& z8RkX^KDG(EA_`7vjT8bhAhoP`8|h4X5Rc`>&jQpvOXpaOg@&m)1CnT?t71DB`Y=Kw zUT&dfxrJJRVK%~jBZ^JMd){c|8K7j0+KJs=(&ZU$HWLmEHXg7OEVWzR@5}NeN=UzW z9J>fEr0%d#NG2$wBB#2t$RKDH5}InK@aeTCn0Ic(%$Y89ajBz0(gn}5)EYb8`V;+D zL|qPohVY7LP>Zswk?=O+^*qb)ZV0LIQI6H5HwsA2=7JA!k$OvX#)#c;bBmzy%k-w7EMkV(p~ z!3g0Ce(-OKZ==Kwu*pJEi@|^YMTVA^hUE6-b@KAa?D&(#9H%<2G;Om^(r9bK;K}En zk(%?WF>%a{>zfi0EfD0Rh(!=W?rZhPgkdp>*{6yTvHA;9RQ4t=!G+G6DyCB!@TUiKO0fDst|oCPY;C-SUg6$LBE z3uvLonJk{y?wc0`@cwi-?K@g4%U`CYIiLE#FmXA4m7?Vhc7w25!k)5`2>?n=(xN88 z9cmg}db2GBLUbZ+$ITl6zc>VOq(|$j{Mwn0S7N(KW7-yKgZ7${x;$PRBuAVPIpQrf z`KXbb1cRAJMc;Oj#vlQ63U|{9C-lMNgEQX#`QxaXGcQ4ysqtkSlLoC9SPmZFyV&-S zS?D%zQ#620_4Dl}@E8b)8tCu)bN*4P`?jqd(3|rO(3@i+RH@e^erD#1RfmFosn=4H zl@dshYSgG;zD%AvK(bvDe*=n1vpl_Jxgy<$1~!`O z(@Ju;WXytZ#S;=zs0^c2;Ig_@O_RxvbCyOLEZlo=Qhq@7CpU0UJ%_eH~N+#nCa zbQ>T$1O|kZxgUF}?NsPSSMK?M1((Cdy{}m7ha`@Sk$e9b%w_o8tj92(cf$QuLbEZX zv8M9l>#@|RmuaxHc^_8=V^ihE=Z;HoUV|}y$!{!KJIw_5MuQ@tT&AkuEI!Tvo}cP6)a?pmZWuEXJjU8 zwLeRGXmnI=M5X|uOI4aVSfBj$fPz7Fs&YoEfy_t`??_BtmGocaA{h6uz?>bMC4&|&Tw9nWS=Vg#RhyZeT1MkN zsxrfl#3MSsVP!yxklCq7w}r^#Dq47kh#!W$%L^sh-6)Ocu(2%|fW=+|DO!|WgP#Ay z85fa0)O|EWHP|GCf-NejgQKynqi@Mv71)A1UkB?t229Je$elCRBehF_n9!A!mhaE8 zBn=*l=Zst) z2w9Y-Fr^7oEWOfu2dSAEgvUc|Zd_23KIO|OS|c%A?^Q%6FFnWGLUIt_EJ0(=6|^NN znPx9@{IVYgGO2XA>X&61eHhm)D2v|aF?J` zI~=8~-pTFNQ*Apsn)5CRNW7#~L&fbN{()w)`^+!*Ox@p(5r&;772ttJS1GJkW>I?~ zF=jO01U1e+m$}_a3C$*(ubkxzvl3)88IGI7EffTzs)YTKoeip!+#IlMl3LIC#nS8> zZ0CJfmtFzFkD0@(N71+SKY-Qy!yewAaQ<@T2Jru1-evvXuV`jvZ2d3q^BmqzPV!r0 z`+agF7Xu4N!$0oJ0xs?VZX^BV&JcA7KXe-%iqLtoerizOv=eO&lBO%DE#hnnOT^l{ zHUH@l6V~W`zwU-O=nTZ^X;TC8bTd^FX!BHA>oLBN$GbF^W{zFv`z#IlHmptP4CMSM zs9-Z6eCqPjK+;S$K1fwqIgB1?iUC1zEM_MomhcQ6KCI`;fT%8ZG+l0~cPz zYe%(dZF)*XXdeBPtFsarHuMl*6cW6#+mD zzm`7XZv~`pr)PEB-tup6REgh06aN;#(CGGBt(lF3nYGpTYqfVbr+%^kFav#({nFBM zWD?XLXa}i!X!9zR#iV**g-P$BD40onrUeNSLE zx(aeD0tiUfAP8LETbclL{V9?kZ?64bA?=Ke|09kEU0?fg^52W&`{brZdWH{$0Ni1_ z4?%v**)K0Mf8S8|Aa1h+e@Gp`$=%NnchdO0VzI26u% zot9{ta=pXkAF5ibpCIk}u5&eHO7AGXlWCpic?Px(e_FW^^Xln_xJy2&j{@a7~$Pg0HERz zf&WO~o7-=Gzh}&DQ&KK}M~wXutevs0t}cjvSH%-IOQTHep)nk-ICUG{LJKhjViwtiyht-Z&msg;r*9xSOb}Q>9@!;oZtSi5bQFc(w?{c z0C@USy?#_+g5R#Zwf=J>1BZL+3%KL@6NC2W{P%b8E%2{jwD_*Rck|om8JPb|mGZhx z3aCAZiSHnQU({?ybOuIr>{EQU0I zzy7<$2KEAVzk0?mD0$26?THBpG?~{W(LJ)KC_@=Vc;jVfl7=g~6s1yn9ZTa@D>_&E z$s;M}t*cYl+;O^I3>q`|sxvjy{@Z3pQ5v2ISkX^2pM};sK8ugSCLCe(@UC(Zi$$!k zeS6c%NKubCZE&T&`sfM|^6bM?SrkXQuElV?SAz;HGCBuV{g=i!6cmbsL&cfqHy~h= zhg`zPL@+d2niQ89L`bGZ;4l$2m-H;B-S-W^c1qsTnBjv2owG?jDEG>rPC<+{5C zaUqqW`ubVeTD{NkhFYXh=)NnWMzlCQW!tc95*BYkdGm@hTVPOQO;Bn#x+=lN=h(9k z>>F=75)-<~)r$+*vVHAkux?fw_X9TuQz9~Dla~=lJi$U{J1Nx*S@OYQnGTIvgB@wZ zphICKdk?!};Ai>RD!g9gwFFHYDH8Ns3-YQtwhVQlNV3~X$AF&rA5a_e!Z+sNb6ZZ< zSq6X2*TQP(Cf|t`|3okot>T!$l$amoE=ggP6lUL9AOY-H^K~VPLqKNKxy}(3u-A&**D{8*bau$e7A|n$W9V5LQk zF$72`>{7&28VHcLK?37iJ~MQtm~SlsL6M#WiHiTNJpkB*pBs3e+ZbN;f6H2!8UC$j z_^Pv#(%;h3(m@9;f~5NtKlTl5KNpg1WoeBJjeUa|zvL9m0LFxh(-mjEUA}zwW$BuN zWA7lhc?x+vn8#PWkFEg*p6Y$g1IMEYZQJW+`nzt}NZm5lCwR&C3D8 zNqe;8OU0~9ZNPnk3Un;kMwrREQaXIiz*}Upxf9 zWp`QnVUB`=o$1*xj4_&h&j??9g;vLI8pzdCSzjcw!;sO5fpSXcdoqXEHu3TUeKcwG zaH4ZMEAQ;_$dOzHTPneBmQdPW>d4ztiMFo>ZzfgA%44r~S*s|Tl~@6L=@UKNsL?Iu z0P6o#w)=W;`^Db`nike~$A*6u(f|Hn@_q8}q4s}e_P4K>{7634Z`Jrugu7t=-y<>p zL!|%F5#__}%KW}T;Q!0oe*hnHP6-Jv1>|0UOs)f~VNQw7;xhvvJB1N^JosG26RJMO zBSo()=GU;j7=!D^{{}%u38pY-#0S;|d?&?y=*9P!Cx7}dWRcV$tr+X=V|e|NRCyJZ%%kPwjO!8-(p1zv zQj`PagS6CirIpMhj7;Nt9L zK>{dzCJ79J0{ZK>GH?In`?%%@eEiFoGyi}9=n3HM%==GF0%EtjQQ@iZD~NyolI9&> z08qf|mG|Fa1SE^@Mun#c@Gtm!_*<6u!2nM?-hZJCkPW>X6`nD-$%dZ_@!vf3cn1rZ z{zpd-c>e5eT=Q?;w%7d>_IpC-zx&{q;l!Uh@?-D~h>P#WH9tS(?{FUaING1;0zkX} zHVdFtuT|_{KKt<_{Qb+*fX4;y#; z;{QGVKfXQqBmDiFNPlHNg7EL*|G0hhPp*;uh=0FV`LFD25I+?EAKZTW5&nLg?q5}~ zmil4v|DrwmMJEy{P>bLk0>IS$20|@j-eFBu<@5VKMr@-%+y(>R_KVt!k@V_j^ z|9WBns!ha#55fChH-G5C?oOijOL2hm{M{h(4174~|Nq7MyS2C@^M2I}P}{m2M4mOG z50=e8Bl7`O!XK%;U%dJ&l_z5VOH@9f9(PA2KxRMc^^4R8YmacJ*Z(+{0QiLa zC8YZagbGhKiQj=ftju=D;+?wOFYMfRdKI3gx9uOlX7L{ssqP>FQvK1--ZxFx{CrZs zgZ!Qve(0|K8Q5Q4UDDr!{g1;4Kt<_pT=U;N5b@6Z|4(=YRG99@HNV+yHuKkO`DX#* z4*6d?z57*{`zNpp&yW9(fQRJ5f2Ur60?*yJ=0B8qm=yk9EO(#z`_V>)r>g8f!Tl!z z-Q6y}kA43vP~mCx>~FFEiIDFA?^hWB<&3*g;b|iGJHUT~b-$YMSFG({VLiBt@qY$O rJRpC8<|mLx|51y>ydWI@B6YSRz?&6#E+_+oF%X6VXGyd`1N#gBRAZq; diff --git a/python_blog_scraper.zip b/python_blog_scraper.zip deleted file mode 100644 index ab98b3d49955b5f587ee97c42aab87b81748600d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2784 zcmbVO2{e@J8=o=uQDliq8L~5rLAEgVWo+pVrHL#vvJS(D5g}Ak9duQ4Ye=$Xiir^A zPRdLOSt9#RDoeyIxikLxb2>Ta-ury#eV^|;-#O3o{GNCD{T$FBPEi2cTq=i-tiAl0 zL;(;$P(XfheuNxk zg`dBYANUb?xwdxjyAiA`(foycE*=rQIw14~(mAlye4-O_?E1Ee)azvM+(qjo_r$`R z84@>-0QvTl01jv#-jHUS_-Ga;Hm(iC+e$VNhXMhBeOn+lT|Tnz+NA2?^;)^rsbnkrl)D zDWyBVhvszk_pO{?=u=VPJDf3~j5u3OE$Rc7wJ* zy|TjxLaz32SzVO1rL2l%O68MEYSYMrx57WyntYJT0`rt2DibeCEL$D*9e>hsHhFZe zUDzPak1DUXhz)!mc3f<9EYm8?kloIrD!Z;>Rx8;}+i2%NX4n6>q%G^O>=x}L;(|#& zL|gzaFf;@f8XmgV)Tq~FJg0a(p6!2Xs)}NLbe5P*qzfOo>}b9m&J*b#-Ly`k0q|S9e?vkX`z#gtXtR zSjDjm+$bG&Z(}D!3jzR&Ta^Cegq^~9{5W8+poq2dVykZ2wdF_nnUs-W?E) z{z8Qa2&4k+t=@PEc6}c3>cgv|U5@Q>;f#f#X^ohjm{ea6fv=zRo>C^b>4zWTqwDRoj4vOj@{t3=4v+V3hBP*fb{bz77SxHr3Rf6+*v+Y&*rMYV7jeYtrN3Vrxuo8e#) z_dpD4dg7FUcR@C{D1u(=>s2Q@rEe&aooiD&I2qdg5lZO1h!I#F(#v6_v3jQ@BCCS1 zFnA0q=9S@JD$iODPK>FFWL6HWUhn@_b7cSO9s`qJH-+-OQ!~bCx2PUSolDJNZIZUo zpy0DeDpAde2&N95Eci-TIs0I ze`as&CR~jaH>>K3Ysoxqlw<0s){}~~Z_S?3f4hKJRG~D=r6_!eLT4E_fGYu`fhFfM zALK}$RE+9>cY6V|AWc<~=<4pi26B`|9`T{+c_q&)L#rV@?paMsScIdQT46Bafp9J+ z!6^UH+r2*GC7fa!E=SMGCp;9cSEmcpR537 zNU^!YDAbcI2*7|1gD&a9Q69kWB7Bu1)0QPu>60febtAB^`Vf7}xXH_R+iQ&8n7A$} z$lX2TrUUd7lI2L(2VY=<_r=97C%&=O#GVS$nWi7J23GF4YA)RAeU9OBAd#^(MU^qy=kS%!q!TK4<1;GEoxbfL5KmKFOX2r+SGY}f#g)h5 zDJg$yc@tj4Yy8ZJ87c3$8e5HPLam95AV%HtyAXTvo*Q&m!%3ez89pz}og>Hna*b2_ z(|CIg1>kwWTm@fkjjIy=Z8iI2vTFm?&0m?1(BHo2kiV%UNi_GB5;dWJx{;Y8SZ8K* zKtCf2qkal>x<_OJ$pu?+Qa~9}F20fW@c~}aOu-IV88PjJLcZ>usTN&Qo;>nVCUMk) z%0V}n|Cp1WVW*P`Xqg*-SJgg5u5slw8R7_w(xRhA4jWUHKbfv3Px$plMkbc`(evhb zl`mJjW2y8>QuxCCfwZ2tl7D00?qL-p;8WGNHVFz57ip}Vh=PfR%5)i{VPbUjfJ zbb?LYx3Z|7NIs#H+V;v&T(`9b0h$zkuVcx8?`S|BM(@hw??3M<6`t!x!<3p=#1Rj+ z1~!^{uW**`-ZN-9nZHBev|PbGIliP5(RRXJNoigMD^oQGmwDJXs;8?~XAdU;z`i#f z&_E7R(AEltb?L6ziV@rWH@)(|4C{(zGqbZ5%ev8G9kh!Wb@$W^ Date: Fri, 5 Jul 2024 21:59:29 +0530 Subject: [PATCH 6/6] Delete SJEC_session1_CS106_Prajwal directory --- SJEC_session1_CS106_Prajwal/Dockerfile | 12 -- .../docker-compose.yml | 9 -- .../python_web_scrape.py | 121 ------------------ SJEC_session1_CS106_Prajwal/requirements.txt | 4 - 4 files changed, 146 deletions(-) delete mode 100644 SJEC_session1_CS106_Prajwal/Dockerfile delete mode 100644 SJEC_session1_CS106_Prajwal/docker-compose.yml delete mode 100644 SJEC_session1_CS106_Prajwal/python_web_scrape.py delete mode 100644 SJEC_session1_CS106_Prajwal/requirements.txt diff --git a/SJEC_session1_CS106_Prajwal/Dockerfile b/SJEC_session1_CS106_Prajwal/Dockerfile deleted file mode 100644 index 7b1479e..0000000 --- a/SJEC_session1_CS106_Prajwal/Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -FROM python:3.10.2-alpine3.15 -# Create directories -RUN mkdir -p /root/workspace/src -COPY ./python_web_scrape.py /root/workspace/src -# Switch to project directory -WORKDIR /root/workspace/src -# Install required packages -RUN pip install --upgrade pip -RUN pip install requests bs4 html5lib -RUN pip install psycopg2-binary -CMD ["python_web_scrape.py"] -ENTRYPOINT ["python"] \ No newline at end of file diff --git a/SJEC_session1_CS106_Prajwal/docker-compose.yml b/SJEC_session1_CS106_Prajwal/docker-compose.yml deleted file mode 100644 index 0876790..0000000 --- a/SJEC_session1_CS106_Prajwal/docker-compose.yml +++ /dev/null @@ -1,9 +0,0 @@ -psql-db: - image: 'postgres:14' - container_name: psql-db - environment: - - PGPASSWORD=123456 - - POSTGRES_USER=postgres - - POSTGRES_PASSWORD=123456 - ports: - - '5434:5432' \ No newline at end of file diff --git a/SJEC_session1_CS106_Prajwal/python_web_scrape.py b/SJEC_session1_CS106_Prajwal/python_web_scrape.py deleted file mode 100644 index 5316336..0000000 --- a/SJEC_session1_CS106_Prajwal/python_web_scrape.py +++ /dev/null @@ -1,121 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import re -import psycopg2 -from psycopg2 import Error - -url = 'https://blog.python.org/' - -def create_connection(db_name, db_user, db_password, db_host, db_port): - try: - connection = psycopg2.connect( - database=db_name, - user=db_user, - password=db_password, - host=db_host, - port=db_port - ) - print("Connection to PostgreSQL DB successful") - return connection - except Error as e: - print(f"The error '{e}' occurred") - return None - -def execute_query(connection, data): - cursor = connection.cursor() - try: - query = """ - INSERT INTO python_blog_articles (date, title, body, author) - VALUES (%s, %s, %s, %s) - """ - cursor.execute(query, data) - connection.commit() - print("Query executed successfully") - except Error as e: - print(f"The error '{e}' occurred") - -def create_table(connection): - cursor = connection.cursor() - try: - create_table_query = """ - CREATE TABLE IF NOT EXISTS python_blog_articles ( - id SERIAL PRIMARY KEY, - date VARCHAR(100), - title TEXT, - body TEXT, - author VARCHAR(100) - ); - """ - cursor.execute(create_table_query) - connection.commit() - print("Table created successfully or already exists") - except Error as e: - print(f"The error '{e}' occurred") - -def process_page(soup, date, titletext, bodytext, author): - for div in soup.find_all('div', class_='date-outer'): - date_header = div.find('h2', class_='date-header') - if date_header: - date_text = date_header.find('span').get_text(strip=True) - date.append(date_text) - - for post in div.find_all('div', class_='post-outer'): - title_head = post.find('h3', class_='post-title entry-title') - if title_head: - titletext.append(title_head.text.strip()) - - content_div = post.find('div', class_='post-body entry-content') - if content_div: - paragraph_text = ' '.join([p.text.strip() for p in content_div.find_all('p')]) - bodytext.append(paragraph_text) - - footer_head = post.find('div', class_='post-footer') - if footer_head: - footer_text = footer_head.find('span', class_='post-author vcard').text.strip() - author.append(footer_text) - -def main(): - db_name = 'webdemo' - db_user = 'postgres' - db_password = '123456' - db_host = 'localhost' - db_port = '5434' - - connection = create_connection(db_name, db_user, db_password, db_host, db_port) - - if connection: - try: - date = [] - titletext = [] - bodytext = [] - author = [] - - res = requests.get(url) - soup = BeautifulSoup(res.content, 'html5lib') - process_page(soup, date, titletext, bodytext, author) - - while len(titletext) < 50: - older_posts_link = soup.find('a', string=re.compile(r'Older Posts', re.IGNORECASE)) - if older_posts_link: - next_page_url = older_posts_link['href'] - res = requests.get(next_page_url) - soup = BeautifulSoup(res.content, 'html5lib') - process_page(soup, date, titletext, bodytext, author) - else: - break - - create_table(connection) - for i in range(len(titletext)): - data = (date[i], titletext[i], bodytext[i], author[i]) - execute_query(connection, data) - - except Error as e: - print(f"Error: {e}") - - finally: - if connection: - connection.close() - print("PostgreSQL connection is closed") - -if __name__ == "__main__": - main() diff --git a/SJEC_session1_CS106_Prajwal/requirements.txt b/SJEC_session1_CS106_Prajwal/requirements.txt deleted file mode 100644 index 02a89c4..0000000 --- a/SJEC_session1_CS106_Prajwal/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -psycopg2-binary==2.9.3 -beautifulsoup4==4.11.1 -requests==2.27.1 -html5lib==1.1