-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_email.py
More file actions
73 lines (58 loc) · 1.97 KB
/
parse_email.py
File metadata and controls
73 lines (58 loc) · 1.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from email import message, parser, policy, contentmanager, header
from pathlib import Path
from html.parser import HTMLParser
import html as html_lib
class StripHTML(HTMLParser):
    """
    Deprecated.

    Collect the text content of an HTML document, discarding the tags.
    Feed markup with ``feed()`` and read the result with ``get_text()``.
    """

    def __init__(self):
        super().__init__()
        # Text fragments in the order the parser reports them.
        self.text = []

    def handle_data(self, data):
        """Record one run of text found between tags."""
        self.text.append(data)

    def get_text(self):
        """
        Return all collected text joined into one string, with leading and
        trailing whitespace removed.
        """
        # HTMLParser can hold back trailing text (for example text ending in
        # a bare "&" such as "AT&T") until it is told the input is complete.
        # close() flushes that buffer; calling it again is a no-op, so
        # get_text() stays safe to call more than once.
        self.close()
        return "".join(self.text).strip()
def strip_html(html):
    """
    Deprecated.

    Run ``html`` through a fresh StripHTML parser and return the text it
    collected.
    """
    tag_remover = StripHTML()
    tag_remover.feed(html)
    collected = tag_remover.get_text()
    return collected
def parse_email(file):
    """
    Parse an .eml file and return its main fields as a dictionary.

    Parameters:
        file: path of the message file; it is opened in binary mode.

    Returns:
        A dict with the keys:
        "subject", "from", "to", "date" - header values as strings
            ("" when the header is absent);
        "body_html" - the HTML body, or the escaped plain body wrapped in
            <pre>, or a placeholder paragraph when there is no body;
        "body_plain" - the plain-text body ("" when absent);
        "attachments" - list of (filename, size_in_bytes) tuples for every
            part that carries a filename;
        "other_headers" - list of (name, value) pairs for all remaining
            headers.
    """
    with open(file, "rb") as f:
        eml_data = parser.BytesParser(policy=policy.default).parse(f)

    plain_part = eml_data.get_body(preferencelist=("plain",))
    html_part = eml_data.get_body(preferencelist=("html",))

    # Compare against None explicitly: a message object's truth value is
    # its header count, so a body part without headers would be falsy and
    # its content would be dropped.
    html_body = html_part.get_content() if html_part is not None else None
    plain_body = plain_part.get_content() if plain_part is not None else None

    if not html_body and plain_body:
        # Escape the plain text so it cannot be interpreted as markup.
        html_body = f"<pre>{html_lib.escape(plain_body)}</pre>"
    if not html_body:
        html_body = "<p><em>No body exists.</em></p>"

    attachments = []
    for part in eml_data.walk():
        filename = part.get_filename()
        if filename is None:
            continue
        payload = part.get_payload(decode=True)
        filesize = len(payload) if payload else 0
        attachments.append((filename, filesize))

    # Header names are case-insensitive, so match them in lowercase;
    # otherwise "subject:" or "FROM:" would be reported twice.
    primary_headers = {"subject", "from", "to", "date"}
    headers = [
        (name, value)
        for name, value in eml_data.items()
        if name.lower() not in primary_headers
    ]

    return {
        "subject": str(eml_data["Subject"] or ""),
        "from": str(eml_data["From"] or ""),
        "to": str(eml_data["To"] or ""),
        "date": str(eml_data["Date"] or ""),
        "body_html": html_body,
        "body_plain": plain_body or "",
        "attachments": attachments,
        "other_headers": headers,
    }