From 83cc12bf01fd6e47b9c502ab4e254f75c1ba0364 Mon Sep 17 00:00:00 2001 From: elisagao122 <251384093@qq.com> Date: Mon, 20 Jun 2016 21:13:53 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E4=BD=9C?= =?UTF-8?q?=E4=B8=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit by xiaoyan --- class1/week1/homework/elisagao/homework1.py | 74 +++++++++++++++++++++ class1/week1/homework/elisagao/output | 20 ++++++ 2 files changed, 94 insertions(+) create mode 100644 class1/week1/homework/elisagao/homework1.py create mode 100644 class1/week1/homework/elisagao/output diff --git a/class1/week1/homework/elisagao/homework1.py b/class1/week1/homework/elisagao/homework1.py new file mode 100644 index 0000000..cd42247 --- /dev/null +++ b/class1/week1/homework/elisagao/homework1.py @@ -0,0 +1,74 @@ +# encoding=utf-8 +import jieba +import re + +#get content from file +def get_file_content(file_name): + fr = open(file_name, 'r') + return fr +####写文件 +def write_file(file_name, content): + fw = open(file_name, 'w',encoding = 'utf-8') + fw.writelines(content) + fw.close() + +file_name = 'emailsignature.txt' +text = get_file_content(file_name) +email_lines = 0 +output_content = "" + +for line in text: + #person name + if email_lines == 0: + print (line) + output_content += line + email_lines += 1 + continue + if line.find("‐‐") == 0: + email_lines = 0 + continue + #company + company_names = ["产业联合会", "技术社区", "律师事务所", "学校", "University"] + flag_company_name = False + for company_name in company_names: + if line.find(company_name) != -1: + print (line) + output_content += line + flag_company_name = True + break + if flag_company_name: + continue + #phone number + phone_words = ["Tel", "Mobile", "手机", "电话号码", "邮箱", "E‐mail", "mail", "地址", "+86"] + flag_phone_word = False + for phone_word in phone_words: + if line.find(phone_word) == 0: + print (line) + output_content += line + flag_phone_word = True + continue + if flag_phone_word: + continue + #email + email_pattern = '\w*@\w*.\w*.\w*' + if re.match(email_pattern, line): + #print (re.match(email_pattern, line)) + print (line) + output_content += line + continue + + text = line + seg_list = list(jieba.cut(line, cut_all=False)) + article_content = " ".join(list(seg_list)) + + #address + if article_content.find("中关村") != -1 and article_content.find("街") != -1 or article_content.find("Street") != -1: + print (text) + output_content += line + #print ("done") + continue + #output_content += line +write_file("output", output_content) + + + diff --git a/class1/week1/homework/elisagao/output b/class1/week1/homework/elisagao/output new file mode 100644 index 0000000..b113096 --- /dev/null +++ b/class1/week1/homework/elisagao/output @@ -0,0 +1,20 @@ +刘三 Liu, San ++86 15912348765 +sfghsdfg@abc.org.cn +李四 +北清大数据产业联合会 +邮箱:lisi@beiqingdata.com +地址:北京市海淀区北清大学东楼201室 +John Smith +University of Mannheim, Germany +Tel: +49 621 123 4567 +王五 +CSDN‐全球最大中文IT技术社区(www.csdn.net) +手机:13934567890 +E‐mail:gdagsdfs@csdn.net +地址:北京市朝阳区广顺北大街33号院一号楼福码大厦B座12层 +张三 +北京市张三律师事务所|Beijing Zhangsan Law Firm +北京市海淀区中关村有条街1号,邮编:100080 +No. 1 Youtiao Street , ZhongGuanCun West, Haidian District, Beijing 100080 +Mobile: 15023345465|Email: dfgasedt@126.com \ No newline at end of file From f921742fb677d7641450878c88b760fc7a265def Mon Sep 17 00:00:00 2001 From: elisagao122 <251384093@qq.com> Date: Fri, 24 Jun 2016 19:53:30 +0800 Subject: [PATCH 2/2] test --- class1/week1/homework/elisagao/homework1.py | 74 --------------------- class1/week1/homework/elisagao/output | 20 ------ 2 files changed, 94 deletions(-) delete mode 100644 class1/week1/homework/elisagao/homework1.py delete mode 100644 class1/week1/homework/elisagao/output diff --git a/class1/week1/homework/elisagao/homework1.py b/class1/week1/homework/elisagao/homework1.py deleted file mode 100644 index cd42247..0000000 --- a/class1/week1/homework/elisagao/homework1.py +++ /dev/null @@ -1,74 +0,0 @@ -# encoding=utf-8 -import jieba -import re - -#get content from file -def get_file_content(file_name): - fr = open(file_name, 'r') - return fr -####写文件 -def write_file(file_name, content): - fw = open(file_name, 'w',encoding = 'utf-8') - fw.writelines(content) - fw.close() - -file_name = 'emailsignature.txt' -text = get_file_content(file_name) -email_lines = 0 -output_content = "" - -for line in text: - #person name - if email_lines == 0: - print (line) - output_content += line - email_lines += 1 - continue - if line.find("‐‐") == 0: - email_lines = 0 - continue - #company - company_names = ["产业联合会", "技术社区", "律师事务所", "学校", "University"] - flag_company_name = False - for company_name in company_names: - if line.find(company_name) != -1: - print (line) - output_content += line - flag_company_name = True - break - if flag_company_name: - continue - #phone number - phone_words = ["Tel", "Mobile", "手机", "电话号码", "邮箱", "E‐mail", "mail", "地址", "+86"] - flag_phone_word = False - for phone_word in phone_words: - if line.find(phone_word) == 0: - print (line) - output_content += line - flag_phone_word = True - continue - if flag_phone_word: - continue - #email - email_pattern = '\w*@\w*.\w*.\w*' - if re.match(email_pattern, line): - #print (re.match(email_pattern, line)) - print (line) - output_content += line - continue - - text = line - seg_list = list(jieba.cut(line, cut_all=False)) - article_content = " ".join(list(seg_list)) - - #address - if article_content.find("中关村") != -1 and article_content.find("街") != -1 or article_content.find("Street") != -1: - print (text) - output_content += line - #print ("done") - continue - #output_content += line -write_file("output", output_content) - - - diff --git a/class1/week1/homework/elisagao/output b/class1/week1/homework/elisagao/output deleted file mode 100644 index b113096..0000000 --- a/class1/week1/homework/elisagao/output +++ /dev/null @@ -1,20 +0,0 @@ -刘三 Liu, San -+86 15912348765 -sfghsdfg@abc.org.cn -李四 -北清大数据产业联合会 -邮箱:lisi@beiqingdata.com -地址:北京市海淀区北清大学东楼201室 -John Smith -University of Mannheim, Germany -Tel: +49 621 123 4567 -王五 -CSDN‐全球最大中文IT技术社区(www.csdn.net) -手机:13934567890 -E‐mail:gdagsdfs@csdn.net -地址:北京市朝阳区广顺北大街33号院一号楼福码大厦B座12层 -张三 -北京市张三律师事务所|Beijing Zhangsan Law Firm -北京市海淀区中关村有条街1号,邮编:100080 -No. 1 Youtiao Street , ZhongGuanCun West, Haidian District, Beijing 100080 -Mobile: 15023345465|Email: dfgasedt@126.com \ No newline at end of file