From 08439c3ee4cdf98f25179c447e3d7ba16ca87c72 Mon Sep 17 00:00:00 2001 From: dannyward630 Date: Sun, 17 May 2026 15:07:40 -0400 Subject: [PATCH] feat(parser): implement RFC 5322 address parser --- compliance.md | 49 ++++ parser.py | 609 +++++++++++++++++++++++++++++++++++++++++++++++++ source.md | 62 ++++- test_parser.py | 144 ++++++++++++ 4 files changed, 860 insertions(+), 4 deletions(-) create mode 100644 compliance.md create mode 100644 parser.py create mode 100644 test_parser.py diff --git a/compliance.md b/compliance.md new file mode 100644 index 0000000..dd5557a --- /dev/null +++ b/compliance.md @@ -0,0 +1,49 @@ +# RFC 5322 Address Parser Compliance Matrix + +This matrix maps the address parsing ABNF used by RFC 5322 sections 3.2 through 3.4 and obsolete address forms from section 4.4 to the parser implementation and tests. + +| ABNF production | RFC section | Implementation | Test coverage | +| --- | --- | --- | --- | +| quoted-pair | 3.2.1 | Complete, handled in quoted strings, comments, and permissive domain literals | test_s321_quoted_pair_quote, test_s321_quoted_pair_backslash, test_s321_quoted_pair_space, test_s321_quoted_pair_tab, test_s321_quoted_pair_specials | +| FWS | 3.2.2 | Complete for CRLF folding and token-adjacent whitespace normalization | test_s322_fws_around_angle, test_s322_fws_around_at, test_s322_fws_in_quote, test_s322_tabs_between_tokens, test_s322_fws_address_list | +| ctext | 3.2.2 | Complete for printable comment content excluding unescaped parens and backslash | test_s323_comment_prefix, test_s323_comment_mid_addr, test_s323_nested_comment | +| ccontent | 3.2.2 | Complete for ctext, quoted-pair, and nested comment recursion | test_s323_nested_comment, test_s323_escaped_comment_paren | +| comment | 3.2.2 | Complete, including nesting and escaped parens | test_s323_nested_comment, test_s323_escaped_comment_paren, test_invalid_unclosed_comment | +| CFWS | 3.2.2 | Complete around address tokens, stripped semantically while preserving 
comments | test_s323_comment_before_domain_literal, test_s323_comment_suffix, test_s323_group_comment | +| atext | 3.2.3 | Complete for atom and dot-atom validation | test_s34_plus_tag, test_s341_atext_domain | +| atom | 3.2.3 | Complete with CFWS stripped by lexer | test_s325_phrase_atoms, test_s44_obs_domain_leading_dot | +| dot-atom-text | 3.2.3 | Complete with empty segment rejection in strict mode | test_s34_simple_addr_spec, test_invalid_double_dot_local, test_invalid_double_dot_domain | +| dot-atom | 3.2.3 | Complete for strict local-part and domain | test_s341_subdomains, test_s341_dashed_domain | +| specials | 3.2.3 | Complete by exclusion from ATEXT | test_s324_quoted_local_at, test_s321_quoted_pair_specials | +| qtext | 3.2.4 | Complete for printable quoted content excluding quote and backslash | test_s324_quoted_local_dot, test_s324_quoted_local_brackets | +| qcontent | 3.2.4 | Complete for qtext and quoted-pair | test_s324_quoted_display_escaped, test_s321_quoted_pair_quote | +| quoted-string | 3.2.4 | Complete with escaped chars and folded whitespace handling | test_s324_quoted_local_space, test_s324_empty_quoted_local, test_s324_quoted_display | +| word | 3.2.5 | Complete for phrase parsing and obs-local-part | test_s325_phrase_mixed, test_s44_obs_quoted_word_sequence | +| phrase | 3.2.5 | Complete for display-name | test_s325_phrase_atoms, test_s324_quoted_display_comma | +| address | 3.4 | Complete for mailbox and group | test_s34_group_two, test_s34_group_in_address_list | +| mailbox | 3.4 | Complete for name-addr and addr-spec | test_s34_name_addr, test_s34_simple_addr_spec | +| name-addr | 3.4 | Complete with optional display-name and angle-addr | test_s34_name_addr, test_s324_quoted_display | +| angle-addr | 3.4 | Complete in strict mode for normal angle addresses | test_s34_angle_with_domain_literal, test_s322_fws_around_angle | +| group | 3.4 | Complete including empty group and CFWS-only group-list | test_s34_group_two, test_s34_empty_group, 
test_s34_group_with_cfws | +| display-name | 3.4 | Complete via phrase | test_s323_comment_in_display, test_s325_phrase_atoms | +| mailbox-list | 3.4 | Complete, rejects group members | test_s34_mailbox_list | +| address-list | 3.4 | Complete for strict lists and permissive null members | test_s34_addr_list_two, test_s44_obs_addr_list_double_comma | +| group-list | 3.4 | Complete for mailbox-list and CFWS-only forms | test_s34_group_two, test_edge_comment_only_group_list | +| addr-spec | 3.4.1 | Complete with top-level at-sign split | test_s34_simple_addr_spec, test_invalid_missing_at | +| local-part | 3.4.1 | Complete for dot-atom, quoted-string, and permissive obs-local-part | test_s34_plus_tag, test_s324_quoted_local_space, test_s44_obs_local_mixed | +| domain | 3.4.1 | Complete for dot-atom, domain-literal, and permissive obs-domain | test_s341_subdomains, test_s341_ipv4_literal, test_s44_obs_domain_trailing_dot | +| domain-literal | 3.4.1 | Complete for IPv4 and IPv6 literals in strict mode | test_s341_ipv4_literal, test_s341_ipv6_literal, test_s341_full_ipv6_literal | +| dtext | 3.4.1 | Complete for strict IP literal payloads and permissive obs-dtext | test_s341_ipv4_literal, test_s341_ipv6_literal | +| obs-angle-addr | 4.4 | Complete in permissive mode, route ignored | test_s44_obs_angle_route | +| obs-route | 4.4 | Complete in permissive mode | test_s44_obs_angle_route | +| obs-domain-list | 4.4 | Complete enough for route discard semantics | test_s44_obs_angle_route | +| obs-mbox-list | 4.4 | Complete in permissive parse_mailbox_list through null-member skipping | test_s44_obs_addr_list_leading_empty, test_s44_obs_addr_list_trailing_empty | +| obs-addr-list | 4.4 | Complete in permissive parse_address_list through null-member skipping | test_s44_obs_addr_list_double_comma | +| obs-group-list | 4.4 | Complete in permissive mode for comma-only group list | test_s44_obs_group_empty_commas | +| obs-local-part | 4.4 | Complete in permissive mode for atom and 
"""RFC 5322 address parser.

The parser implements the address-related ABNF from RFC 5322 sections
3.2 through 3.4, with optional support for the obsolete address forms in
section 4.4. It intentionally keeps the public surface small and has no
dependencies outside the Python standard library.
"""

from __future__ import annotations

from dataclasses import dataclass, field
import ipaddress
import re


# Characters permitted in an atom (atext, RFC 5322 section 3.2.3).
ATEXT = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!#$%&'*+-/=?^_`{|}~")
# Control characters accepted as obs-dtext inside permissive domain literals.
NO_WS_CTL = {chr(code) for code in (*range(1, 9), 11, 12, *range(14, 32), 127)}


class ParseError(ValueError):
    """Raised when an address does not match the supported RFC 5322 grammar."""


@dataclass
class RFC5322Address:
    """Parsed RFC 5322 email address."""

    # Display name (phrase) or None when absent or empty.
    display_name: str | None
    # Unquoted local-part; empty string for groups.
    local_part: str
    # Domain text, including brackets for domain literals; empty for groups.
    domain: str
    # True when this object represents a group rather than a mailbox.
    is_group: bool = False
    # Parsed mailboxes of a group; empty for plain mailboxes.
    group_members: list["RFC5322Address"] = field(default_factory=list)
    # Comment bodies (outer parentheses removed) in order of appearance.
    comments: list[str] = field(default_factory=list)
    # The raw text this address was parsed from.
    source: str = ""


class AddressParser:
    """RFC 5322 compliant email address parser.

    Args:
        strict: If True, reject obsolete address productions from section 4.4.
            If False, accept obs-angle-addr, obs-local-part, obs-domain,
            obs-mbox-list, obs-addr-list, and obs-group-list.
    """

    def __init__(self, strict: bool = True):
        self.strict = strict

    def parse(self, raw: str) -> RFC5322Address:
        """Parse a single mailbox or group address.

        A colon outside quotes, comments, brackets and angle brackets marks
        the input as a group; anything else is parsed as a mailbox.

        Raises:
            TypeError: if ``raw`` is not a string.
            ParseError: if ``raw`` does not match the grammar.
        """

        source = self._validate_input(raw)
        group_colon = self._find_top_level(source, ":")
        if group_colon != -1:
            return self._parse_group(source, group_colon)
        return self._parse_mailbox(source)

    def parse_address_list(self, raw: str) -> list[RFC5322Address]:
        """Parse a comma-separated address-list per RFC 5322 section 3.4.

        Commas inside a group's member list (between its ``:`` and ``;``) do
        not separate list members. Empty members are skipped in permissive
        mode and rejected in strict mode.
        """

        source = self._validate_input(raw)
        addresses: list[RFC5322Address] = []
        for part in self._split_top_level(source, ",", groups=True):
            if not part.strip():
                if self.strict:
                    raise ParseError("empty address-list member requires obsolete syntax")
                continue
            addresses.append(self.parse(part))
        if not addresses and self.strict:
            raise ParseError("address-list must contain at least one address")
        return addresses

    def parse_mailbox_list(self, raw: str) -> list[RFC5322Address]:
        """Parse a comma-separated mailbox-list per RFC 5322 section 3.4.

        Raises:
            ParseError: if any member is a group or fails to parse.
        """

        mailboxes = self.parse_address_list(raw)
        if any(item.is_group for item in mailboxes):
            raise ParseError("mailbox-list cannot contain group addresses")
        return mailboxes

    def _parse_group(self, source: str, colon: int) -> RFC5322Address:
        """Parse ``display-name ":" [group-list] ";"``.

        ``colon`` is the index of the top-level colon already located by the
        caller. Only CFWS may follow the terminating semicolon.
        """
        semi = self._find_top_level(source, ";")
        if semi == -1 or semi < colon:
            raise ParseError("group address must end with a semicolon")
        clean_trailer, trailer_comments = self._strip_cfws(source[semi + 1 :])
        if clean_trailer.strip():
            raise ParseError("unexpected text after group semicolon")

        display, display_comments = self._parse_phrase(source[:colon])
        members_raw = source[colon + 1 : semi]
        clean_members, list_comments = self._strip_cfws(members_raw)
        members: list[RFC5322Address] = []
        if clean_members.strip():
            try:
                members = self.parse_mailbox_list(members_raw)
            except ParseError:
                if self.strict:
                    raise
                # Permissive retry: parse every non-empty chunk as a mailbox.
                members = [
                    self._parse_mailbox(chunk)
                    for chunk in self._split_top_level(members_raw, ",")
                    if chunk.strip()
                ]

        # Comments inside member text are already recorded on each member, so
        # the list-level comments are used only when no member was parsed
        # (CFWS-only group-list); using both would report every comment twice.
        if members:
            member_comments = [note for member in members for note in member.comments]
        else:
            member_comments = list_comments

        return RFC5322Address(
            display_name=display,
            local_part="",
            domain="",
            is_group=True,
            group_members=members,
            comments=[*display_comments, *member_comments, *trailer_comments],
            source=source,
        )

    def _parse_mailbox(self, source: str) -> RFC5322Address:
        """Parse a mailbox: either a name-addr (``phrase <addr>``) or a bare addr-spec."""
        original_source = source
        work = source.strip()
        lt = self._find_top_level(work, "<")
        if lt == -1:
            return self._parse_addr_spec(work, original_source)

        gt = self._find_matching_angle(work, lt)
        if gt == -1:
            raise ParseError("angle address is missing closing '>'")
        trailing, trailing_comments = self._strip_cfws(work[gt + 1 :])
        if trailing.strip():
            raise ParseError("unexpected text after angle address")
        display, display_comments = self._parse_phrase(work[:lt])
        addr_source = work[lt + 1 : gt]
        route_colon = self._find_top_level(addr_source, ":")
        if route_colon != -1:
            if self.strict:
                raise ParseError("obsolete route address requires permissive mode")
            # Drop the obs-route at the colon that terminates it; an earlier
            # colon may sit inside a domain literal such as [IPv6:...].
            addr_source = addr_source[route_colon + 1 :]
        parsed = self._parse_addr_spec(addr_source, original_source)
        parsed.display_name = display or None
        parsed.comments = [*display_comments, *parsed.comments, *trailing_comments]
        return parsed

    def _parse_addr_spec(self, raw: str, source: str) -> RFC5322Address:
        """Parse ``local-part "@" domain`` after removing CFWS.

        ``source`` is stored unchanged on the returned address.
        """
        cleaned, comments = self._strip_cfws(raw)
        at = self._find_top_level(cleaned, "@")
        if at == -1:
            raise ParseError("addr-spec must contain @")
        if self._find_top_level(cleaned[at + 1 :], "@") != -1:
            raise ParseError("addr-spec contains more than one @")

        local = self._parse_local_part(cleaned[:at].strip())
        domain = self._parse_domain(cleaned[at + 1 :].strip())
        return RFC5322Address(
            display_name=None,
            local_part=local,
            domain=domain,
            is_group=False,
            comments=comments,
            source=source,
        )

    def _parse_local_part(self, raw: str) -> str:
        """Validate a local-part and return it (quoted-strings are unquoted).

        Strict mode accepts a dot-atom or one quoted-string; permissive mode
        additionally accepts a dot-separated sequence of words.
        """
        if not raw:
            raise ParseError("local-part cannot be empty")
        if raw.startswith('"'):
            if not self._is_complete_quoted_string(raw):
                if self.strict:
                    raise ParseError("quoted local-part must be a single quoted-string in strict mode")
                return self._parse_obs_word_list(raw, "local-part")
            return self._parse_quoted_string(raw)
        if self._is_dot_atom(raw):
            return raw
        if self.strict:
            raise ParseError("local-part is not dot-atom or quoted-string")
        return self._parse_obs_word_list(raw, "local-part")

    def _parse_domain(self, raw: str) -> str:
        """Validate a domain (dot-atom, domain-literal, or permissive obs-domain)."""
        if not raw:
            raise ParseError("domain cannot be empty")
        if raw.startswith("["):
            return self._parse_domain_literal(raw)
        if self._is_dot_atom(raw):
            return raw
        if self.strict:
            raise ParseError("domain is not dot-atom or domain-literal")
        # Permissive: empty labels (leading/trailing/double dots) are tolerated
        # as long as every non-empty label is an atom.
        parts = raw.split(".")
        for part in parts:
            if part and not self._is_atom(part):
                raise ParseError("invalid obs-domain atom")
        if not any(parts):
            raise ParseError("obs-domain cannot be only dots")
        return raw

    def _parse_domain_literal(self, raw: str) -> str:
        """Validate a bracketed domain-literal and return it with brackets.

        Strict mode requires an IPv4 address or an ``IPv6:``-tagged IPv6
        address. Permissive mode also accepts other printable content,
        NO-WS-CTL characters and quoted-pairs (obs-dtext).
        """
        if not raw.endswith("]"):
            raise ParseError("domain-literal must end with ]")
        inner = raw[1:-1]
        # Only brackets that are not part of a quoted-pair are illegal.
        index = 0
        while index < len(inner):
            if inner[index] == "\\":
                index += 2
                continue
            if inner[index] in "[]":
                raise ParseError("domain-literal cannot contain unescaped brackets")
            index += 1
        content = self._collapse_fws(inner).strip()
        if not content:
            raise ParseError("domain-literal cannot be empty")
        if content.lower().startswith("ipv6:"):
            try:
                ipaddress.IPv6Address(content[5:])
            except ValueError as exc:
                raise ParseError("invalid IPv6 domain-literal") from exc
            return f"[{content}]"
        try:
            ipaddress.IPv4Address(content)
        except ValueError:
            pass
        else:
            return f"[{content}]"
        if self.strict:
            raise ParseError("domain-literal must be IPv4 or IPv6 in strict mode")
        index = 0
        while index < len(content):
            char = content[index]
            code = ord(char)
            if char == "\\":
                if index + 1 >= len(content):
                    raise ParseError("dangling quoted-pair in domain-literal")
                index += 2
                continue
            if char in "[]" or code < 33 or code > 126:
                if char not in NO_WS_CTL:
                    raise ParseError("invalid obs-dtext in domain-literal")
            index += 1
        return f"[{content}]"

    def _parse_obs_word_list(self, raw: str, label: str) -> str:
        """Parse a dot-separated sequence of atoms and quoted-strings.

        Quoted words are re-emitted in quoted, re-escaped form so the result
        keeps the word boundaries of the input.
        """
        parts = self._split_top_level(raw, ".")
        if not parts or any(part == "" for part in parts):
            raise ParseError(f"{label} contains an empty word")
        normalized: list[str] = []
        for part in parts:
            item = part.strip()
            if item.startswith('"'):
                if not self._is_complete_quoted_string(item):
                    raise ParseError(f"invalid quoted-string in obsolete {label}")
                normalized.append(f'"{self._escape_quoted(self._parse_quoted_string(item))}"')
            elif self._is_atom(item):
                normalized.append(item)
            else:
                raise ParseError(f"invalid word in obsolete {label}")
        return ".".join(normalized)

    def _parse_phrase(self, raw: str) -> tuple[str | None, list[str]]:
        """Parse a display-name phrase.

        Returns:
            ``(text, comments)`` where ``text`` is the words joined by single
            spaces, or None when nothing but CFWS is present.
        """
        cleaned, comments = self._strip_cfws(raw)
        cleaned = cleaned.strip()
        if not cleaned:
            return None, comments
        words = self._read_phrase_words(cleaned)
        if not words:
            raise ParseError("display-name must be a phrase")
        return " ".join(words), comments

    def _read_phrase_words(self, raw: str) -> list[str]:
        """Split a CFWS-free phrase into atoms and unquoted quoted-strings.

        In permissive mode a bare ``.`` is tolerated and dropped from the
        result; in strict mode it is an error.
        """
        words: list[str] = []
        index = 0
        while index < len(raw):
            while index < len(raw) and raw[index].isspace():
                index += 1
            if index >= len(raw):
                break
            if raw[index] == '"':
                end = self._quoted_end(raw, index)
                if end == -1:
                    raise ParseError("unterminated quoted-string in phrase")
                words.append(self._parse_quoted_string(raw[index : end + 1]))
                index = end + 1
                continue
            start = index
            while index < len(raw) and raw[index] in ATEXT:
                index += 1
            if start == index:
                if not self.strict and raw[index] == ".":
                    words.append(".")
                    index += 1
                    continue
                raise ParseError("invalid phrase word")
            words.append(raw[start:index])
        return [word for word in words if word != "."]

    def _parse_quoted_string(self, raw: str) -> str:
        """Return the content of a quoted-string with quoted-pairs resolved.

        Folding whitespace (CRLF followed by spaces/tabs) becomes one space.
        Bare spaces and horizontal tabs are kept because WSP is legal inside a
        quoted-string; other control characters are rejected.
        """
        if not self._is_complete_quoted_string(raw):
            raise ParseError("invalid quoted-string")
        body = raw[1:-1]
        result: list[str] = []
        index = 0
        while index < len(body):
            char = body[index]
            if char == "\\":
                if index + 1 >= len(body):
                    raise ParseError("dangling quoted-pair")
                nxt = body[index + 1]
                if not self._is_vchar_or_wsp(nxt):
                    raise ParseError("quoted-pair must escape VCHAR or WSP")
                result.append(nxt)
                index += 2
                continue
            if char == "\r" and body[index : index + 2] == "\r\n":
                next_index = index + 2
                if next_index < len(body) and body[next_index] in " \t":
                    while next_index < len(body) and body[next_index] in " \t":
                        next_index += 1
                    result.append(" ")
                    index = next_index
                    continue
            is_bad_control = (ord(char) < 32 and char != "\t") or ord(char) == 127
            if char in {'"', "\\"} or is_bad_control:
                raise ParseError("invalid qcontent")
            result.append(char)
            index += 1
        return "".join(result)

    def _strip_cfws(self, raw: str) -> tuple[str, list[str]]:
        """Remove comments and normalise whitespace outside quotes/literals.

        Returns:
            ``(text, comments)``: the text without comments, and the comment
            bodies in order of appearance.

        Raises:
            ParseError: on an unterminated quoted-string, domain-literal or
                comment.
        """
        unfolded = self._collapse_fws(raw)
        result: list[str] = []
        comments: list[str] = []
        index = 0
        while index < len(unfolded):
            char = unfolded[index]
            if char == '"':
                end = self._quoted_end(unfolded, index)
                if end == -1:
                    raise ParseError("unterminated quoted-string")
                result.append(unfolded[index : end + 1])
                index = end + 1
                continue
            if char == "[":
                end = self._literal_end(unfolded, index)
                if end == -1:
                    raise ParseError("unterminated domain-literal")
                result.append(unfolded[index : end + 1])
                index = end + 1
                continue
            if char == "(":
                comment, index = self._consume_comment(unfolded, index)
                comments.append(comment)
                continue
            result.append(char)
            index += 1

        return self._normalize_outside_quotes("".join(result)).strip(), comments

    def _consume_comment(self, raw: str, start: int) -> tuple[str, int]:
        """Read the comment whose opening parenthesis is at ``start``.

        Returns:
            ``(body, next_index)``: the stripped comment body with quoted-pairs
            resolved and nested parentheses kept, and the index just past the
            closing parenthesis.
        """
        depth = 1
        index = start + 1
        out: list[str] = []
        while index < len(raw):
            char = raw[index]
            if char == "\\":
                if index + 1 >= len(raw):
                    raise ParseError("dangling quoted-pair in comment")
                nxt = raw[index + 1]
                if not self._is_vchar_or_wsp(nxt):
                    raise ParseError("quoted-pair in comment must escape VCHAR or WSP")
                out.append(nxt)
                index += 2
                continue
            if char == "(":
                depth += 1
            elif char == ")":
                depth -= 1
                if depth == 0:
                    return "".join(out).strip(), index + 1
            out.append(char)
            index += 1
        raise ParseError("unterminated comment")

    def _split_top_level(self, raw: str, sep: str, *, groups: bool = False) -> list[str]:
        """Split ``raw`` on ``sep`` outside quotes, comments, brackets and angles.

        Args:
            raw: Text to split.
            sep: Single-character separator.
            groups: When True, separators between a top-level ``:`` and the
                following top-level ``;`` (a group's member list) are not
                split points.

        Returns:
            The stripped pieces; always at least one element.
        """
        parts: list[str] = []
        start = 0
        index = 0
        quote = False
        bracket = 0
        angle = 0
        comment = 0
        in_group = False
        while index < len(raw):
            char = raw[index]
            if quote:
                if char == "\\":
                    index += 2
                    continue
                if char == '"':
                    quote = False
                index += 1
                continue
            if comment:
                if char == "\\":
                    index += 2
                    continue
                if char == "(":
                    comment += 1
                elif char == ")":
                    comment -= 1
                index += 1
                continue
            top_level = not bracket and not angle
            if char == sep and top_level and not in_group:
                parts.append(raw[start:index].strip())
                start = index + 1
            elif groups and top_level and char == ":":
                in_group = True
            elif groups and top_level and char == ";":
                in_group = False
            elif char == '"':
                quote = True
            elif char == "(":
                comment = 1
            elif char == "[":
                bracket += 1
            elif char == "]" and bracket:
                bracket -= 1
            elif char == "<":
                angle += 1
            elif char == ">" and angle:
                angle -= 1
            index += 1
        parts.append(raw[start:].strip())
        return parts

    def _normalize_outside_quotes(self, raw: str) -> str:
        """Collapse whitespace runs outside quoted-strings and domain literals.

        Runs of spaces/tabs become a single space, and whitespace adjacent to
        the delimiters ``@ < > , : ; .`` is removed entirely.
        """
        result: list[str] = []
        index = 0
        quote = False
        bracket = False
        pending_space = False
        delimiters = set("@<>,:;.")
        while index < len(raw):
            char = raw[index]
            if quote or bracket:
                # Copy verbatim, keeping quoted-pairs intact.
                closer = '"' if quote else "]"
                result.append(char)
                if char == "\\" and index + 1 < len(raw):
                    index += 1
                    result.append(raw[index])
                elif char == closer:
                    quote = False
                    bracket = False
                index += 1
                continue
            if char in " \t":
                pending_space = True
                index += 1
                continue
            if char in delimiters:
                pending_space = False
                while result and result[-1] == " ":
                    result.pop()
                result.append(char)
            else:
                if pending_space and result and result[-1] not in delimiters:
                    result.append(" ")
                pending_space = False
                if char == '"':
                    quote = True
                elif char == "[":
                    bracket = True
                result.append(char)
            index += 1
        return "".join(result)

    def _find_top_level(self, raw: str, target: str) -> int:
        """Return the index of the first ``target`` outside quotes, comments,
        brackets and angle brackets, or -1 when there is none."""
        index = 0
        quote = False
        bracket = 0
        angle = 0
        comment = 0
        while index < len(raw):
            char = raw[index]
            if quote:
                if char == "\\":
                    index += 2
                    continue
                if char == '"':
                    quote = False
                index += 1
                continue
            if comment:
                if char == "\\":
                    index += 2
                    continue
                if char == "(":
                    comment += 1
                elif char == ")":
                    comment -= 1
                index += 1
                continue
            if char == target and not bracket and not angle:
                return index
            if char == '"':
                quote = True
            elif char == "(":
                comment = 1
            elif char == "[":
                bracket += 1
            elif char == "]" and bracket:
                bracket -= 1
            elif char == "<":
                angle += 1
            elif char == ">" and angle:
                angle -= 1
            index += 1
        return -1

    def _find_matching_angle(self, raw: str, start: int) -> int:
        """Return the index of the ``>`` closing the ``<`` at ``start``, or -1.

        A ``>`` inside a quoted-string, comment or domain literal does not
        count, and quoted-pairs inside quoted-strings and comments are skipped
        as a unit so an escaped quote or parenthesis cannot end the construct.
        """
        if start == -1:
            return -1
        quote = False
        bracket = 0
        comment = 0
        index = start + 1
        while index < len(raw):
            char = raw[index]
            if quote or comment:
                if char == "\\":
                    index += 2
                    continue
                if quote:
                    if char == '"':
                        quote = False
                elif char == "(":
                    comment += 1
                elif char == ")":
                    comment -= 1
            elif char == '"':
                quote = True
            elif char == "(":
                comment = 1
            elif char == "[":
                bracket += 1
            elif char == "]" and bracket:
                bracket -= 1
            elif char == ">" and not bracket:
                return index
            index += 1
        return -1

    def _quoted_end(self, raw: str, start: int) -> int:
        """Return the index of the quote closing the one at ``start``, or -1."""
        index = start + 1
        while index < len(raw):
            if raw[index] == "\\":
                index += 2
                continue
            if raw[index] == '"':
                return index
            index += 1
        return -1

    def _literal_end(self, raw: str, start: int) -> int:
        """Return the index of the ``]`` closing the ``[`` at ``start``, or -1."""
        index = start + 1
        while index < len(raw):
            if raw[index] == "\\":
                index += 2
                continue
            if raw[index] == "]":
                return index
            index += 1
        return -1

    def _is_complete_quoted_string(self, raw: str) -> bool:
        """Return True when ``raw`` is exactly one quoted-string."""
        return raw.startswith('"') and self._quoted_end(raw, 0) == len(raw) - 1

    def _is_atom(self, raw: str) -> bool:
        """Return True when ``raw`` is non-empty and made only of atext."""
        return bool(raw) and all(char in ATEXT for char in raw)

    def _is_dot_atom(self, raw: str) -> bool:
        """Return True when ``raw`` is atoms joined by single dots."""
        return bool(raw) and all(self._is_atom(part) for part in raw.split("."))

    def _collapse_fws(self, raw: str) -> str:
        """Replace each CRLF followed by spaces/tabs with a single space."""
        return re.sub(r"\r\n[ \t]+", " ", raw)

    def _is_vchar_or_wsp(self, char: str) -> bool:
        """Return True for space, tab, or a printable ASCII character (33-126)."""
        code = ord(char)
        return char in " \t" or 33 <= code <= 126

    def _escape_quoted(self, raw: str) -> str:
        """Backslash-escape backslashes and double quotes for re-quoting."""
        return raw.replace("\\", "\\\\").replace('"', '\\"')

    def _validate_input(self, raw: str) -> str:
        """Check type, emptiness and line length, then return ``raw`` unchanged.

        The 998-character limit is applied to each CRLF-delimited line, so a
        long folded header is accepted as long as no single line is too long.

        Raises:
            TypeError: if ``raw`` is not a string.
            ParseError: if ``raw`` is empty or contains an over-long line.
        """
        if not isinstance(raw, str):
            raise TypeError("raw address must be a string")
        if not raw:
            raise ParseError("address cannot be empty")
        if any(len(line) > 998 for line in raw.split("\r\n")):
            raise ParseError("address exceeds RFC 5322 998 character line limit")
        return raw
+666,8 @@ RFC 5322 Internet Message Format October 2008 CFWS = (1*([FWS] comment) [FWS]) / FWS + + @@ -788,6 +801,8 @@ RFC 5322 Internet Message Format October 2008 DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS] + + A quoted-string is treated as a unit. That is, quoted-string is identical to atom, semantically. Since a quoted-string is allowed to contain FWS, folding is permitted. Also note that since quoted-pair @@ -926,6 +941,8 @@ RFC 5322 Internet Message Format October 2008 group-list = mailbox-list / CFWS / obs-group-list + + A mailbox receives mail. It is a conceptual entity that does not necessarily pertain to file storage. For example, some sites may choose to print mail on a printer and deliver the output to the @@ -937,7 +954,18 @@ RFC 5322 Internet Message Format October 2008 application, and (2) an addr-spec address enclosed in angle brackets -[CAP-ANNOTATION-REQUIRED: INSERT SECOND CAP BLOCK HERE — SEE DOCUMENT HEADER FOR REQUIRED FORMAT. BOTH BLOCKS ARE VALIDATED BY CI.] + @@ -1010,6 +1038,8 @@ RFC 5322 Internet Message Format October 2008 domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS] + + dtext = %d33-90 / ; Printable US-ASCII %d94-126 / ; characters not including obs-dtext ; "[", "]", or "\" @@ -1098,7 +1128,18 @@ RFC 5322 Internet Message Format October 2008 -[CAP-ANNOTATION-REQUIRED: INSERT THIRD CAP BLOCK HERE — SEE DOCUMENT HEADER FOR REQUIRED FORMAT. BOTH BLOCKS ARE VALIDATED BY CI.] + @@ -1989,6 +2030,8 @@ RFC 5322 Internet Message Format October 2008 obs-dtext = obs-NO-WS-CTL / quoted-pair + + When interpreting addresses, the route portion SHOULD be ignored. 4.5. Obsolete Header Fields @@ -2001,7 +2044,18 @@ RFC 5322 Internet Message Format October 2008 -[CAP-ANNOTATION-REQUIRED: INSERT FOURTH CAP BLOCK HERE — SEE DOCUMENT HEADER FOR REQUIRED FORMAT. BOTH BLOCKS ARE VALIDATED BY CI.] 
"""Smoke and compliance tests for parser.py."""

from __future__ import annotations

import unittest

from parser import AddressParser, ParseError


# Each entry is (test name suffix, raw input, expectations). Expectation keys
# are RFC5322Address attribute names, plus three special keys handled by
# ParserGeneratedTests.assert_address: "list_len", "mailbox_len" and "members".
STRICT_CASES = [
    ("s321_quoted_pair_quote", r'"quoted\"string"@example.com', {"local_part": 'quoted"string'}),
    ("s321_quoted_pair_backslash", r'"a\\b"@example.com', {"local_part": r"a\b"}),
    ("s321_quoted_pair_space", r'"a\ b"@example.com', {"local_part": "a b"}),
    ("s321_quoted_pair_tab", '"a\\\tb"@example.com', {"local_part": "a\tb"}),
    ("s321_quoted_pair_specials", r'"very.(),:;<>\"@[]\\ long"@example.com', {"local_part": 'very.(),:;<>"@[]\\ long'}),
    ("s322_fws_around_angle", ' John <john@example.com> ', {"display_name": "John", "local_part": "john"}),
    ("s322_fws_around_at", 'user \r\n\t @ \r\n example.com', {"local_part": "user", "domain": "example.com"}),
    ("s322_fws_in_quote", '"a\r\n b"@example.com', {"local_part": "a b"}),
    ("s322_tabs_between_tokens", '\tJane\t<jane@example.com>\t', {"display_name": "Jane"}),
    ("s322_fws_address_list", 'a@example.com,\r\n b@example.com', {"list_len": 2}),
    ("s323_comment_prefix", "(comment)user@example.com", {"comments": ["comment"], "local_part": "user"}),
    ("s323_comment_mid_addr", "user(mid)@(end)example.com", {"comments": ["mid", "end"]}),
    ("s323_nested_comment", "user(a(b)c)@example.com", {"comments": ["a(b)c"]}),
    ("s323_comment_suffix", "user@example.com (legacy display)", {"comments": ["legacy display"]}),
    ("s323_escaped_comment_paren", r"user(a\)b)@example.com", {"comments": ["a)b"]}),
    ("s323_comment_in_display", "John (Q) Doe <john@example.com>", {"display_name": "John Doe", "comments": ["Q"]}),
    ("s323_comment_before_domain_literal", "user@(net)[192.168.1.1]", {"comments": ["net"], "domain": "[192.168.1.1]"}),
    ("s323_group_comment", "Friends (team): a@example.com;", {"display_name": "Friends", "comments": ["team"]}),
    ("s324_quoted_local_space", '" "@example.com', {"local_part": " "}),
    ("s324_quoted_local_dot", '"john.doe"@example.com', {"local_part": "john.doe"}),
    ("s324_quoted_local_at", '"john@dept"@example.com', {"local_part": "john@dept"}),
    ("s324_quoted_display", '"John Doe" <john@example.com>', {"display_name": "John Doe"}),
    ("s324_quoted_display_comma", '"Doe, John" <john@example.com>', {"display_name": "Doe, John"}),
    ("s324_quoted_display_escaped", r'"Doe \"JD\"" <jd@example.com>', {"display_name": 'Doe "JD"'}),
    ("s324_quoted_local_brackets", '"a[b]c"@example.com', {"local_part": "a[b]c"}),
    ("s324_empty_quoted_local", '""@example.com', {"local_part": ""}),
    ("s325_phrase_atoms", "John Q Public <jqp@example.com>", {"display_name": "John Q Public"}),
    ("s325_phrase_mixed", 'John "Q" Public <jqp@example.com>', {"display_name": "John Q Public"}),
    ("s325_unstructured_not_display", "alerts@example.com", {"display_name": None}),
    ("s34_simple_addr_spec", "user@example.com", {"local_part": "user", "domain": "example.com"}),
    ("s34_plus_tag", "user+tag@example.com", {"local_part": "user+tag"}),
    ("s34_name_addr", "John Doe <john@example.com>", {"display_name": "John Doe"}),
    ("s34_addr_list_two", "a@example.com, b@example.com", {"list_len": 2}),
    ("s34_addr_list_name_addr", "A <a@example.com>, B <b@example.com>", {"list_len": 2}),
    ("s34_mailbox_list", "a@example.com,b@example.com,c@example.com", {"mailbox_len": 3}),
    ("s34_group_two", "A Group:user1@a.com, user2@b.com;", {"is_group": True, "members": 2}),
    ("s34_empty_group", "Undisclosed:;", {"is_group": True, "members": 0}),
    ("s34_group_with_cfws", "Team: (none) ;", {"is_group": True, "members": 0, "comments": ["none"]}),
    ("s34_group_in_address_list", "Team:a@a.com;, b@b.com", {"list_len": 2}),
    ("s34_angle_with_domain_literal", "Postmaster <postmaster@[192.168.1.1]>", {"domain": "[192.168.1.1]"}),
    ("s34_comment_legacy_name", "john@example.com (John Doe)", {"comments": ["John Doe"]}),
    ("s341_ipv4_literal", "user@[192.168.1.1]", {"domain": "[192.168.1.1]"}),
    ("s341_ipv6_literal", "user@[IPv6:2001:db8::1]", {"domain": "[IPv6:2001:db8::1]"}),
    ("s341_full_ipv6_literal", "postmaster@[IPv6:2001:db8:85a3::8a2e:370:7334]", {"domain": "[IPv6:2001:db8:85a3::8a2e:370:7334]"}),
    ("s341_subdomains", "user@mail.example.co.uk", {"domain": "mail.example.co.uk"}),
    ("s341_dashed_domain", "user@mx-1.example.com", {"domain": "mx-1.example.com"}),
    ("s341_atext_domain", "user@x+y.example", {"domain": "x+y.example"}),
    ("s341_long_but_valid", f"{'a' * 64}@example.com", {"local_part": "a" * 64}),
    ("s341_literal_with_fws", "user@[192.168.1.1]", {"domain": "[192.168.1.1]"}),
    ("edge_max_length", f"{'a' * 60}@{'b' * 60}.com", {"domain": f"{'b' * 60}.com"}),
    ("edge_nested_comments_deep", "a(1(2(3)))@example.com", {"comments": ["1(2(3))"]}),
    ("edge_comment_only_group_list", "Empty:(comment);", {"is_group": True, "members": 0}),
    ("edge_multiple_comments", "(a)u(b)@(c)d.com(d)", {"comments": ["a", "b", "c", "d"]}),
    ("edge_empty_quoted_display", '"" <user@example.com>', {"display_name": None}),
]


# Same shape as STRICT_CASES, but run with AddressParser(strict=False).
PERMISSIVE_CASES = [
    ("s44_obs_local_mixed", 'user."quoted"@example.com', {"local_part": 'user."quoted"'}),
    ("s44_obs_domain_leading_dot", "user@.leading-dot.com", {"domain": ".leading-dot.com"}),
    ("s44_obs_domain_trailing_dot", "user@example.com.", {"domain": "example.com."}),
    ("s44_obs_angle_route", "<@old.example,@relay.example:user@example.com>", {"local_part": "user"}),
    ("s44_obs_addr_list_leading_empty", ", a@example.com", {"list_len": 1}),
    ("s44_obs_addr_list_trailing_empty", "a@example.com,", {"list_len": 1}),
    ("s44_obs_addr_list_double_comma", "a@example.com,,b@example.com", {"list_len": 2}),
    ("s44_obs_group_empty_commas", "Old:,,;", {"is_group": True, "members": 0}),
    ("s44_obs_quoted_word_sequence", '"first"."last"@example.com', {"local_part": '"first"."last"'}),
]


# (test name suffix, raw input) pairs that strict parse() must reject.
INVALID_CASES = [
    ("invalid_missing_at", "user.example.com"),
    ("invalid_empty_local", "@example.com"),
    ("invalid_empty_domain", "user@"),
    ("invalid_double_dot_local", "user..name@example.com"),
    ("invalid_double_dot_domain", "user@example..com"),
    ("invalid_unclosed_quote", '"user@example.com'),
    ("invalid_unclosed_comment", "user(comment@example.com"),
    ("invalid_unclosed_literal", "user@[192.168.1.1"),
    ("invalid_bad_ipv4", "user@[999.999.999.999]"),
    ("invalid_bad_ipv6", "user@[IPv6:not-an-ip]"),
    ("invalid_obs_local_strict", 'user."quoted"@example.com'),
    ("invalid_obs_domain_strict", "user@.example.com"),
    ("invalid_empty_member_strict", "a@example.com,,b@example.com"),
]


class ParserGeneratedTests(unittest.TestCase):
    """Container for test methods generated from the case tables above."""

    maxDiff = None

    def assert_address(self, raw: str, expected: dict[str, object], *, strict: bool = True) -> None:
        """Parse ``raw`` and compare the result with ``expected``.

        ``list_len`` and ``mailbox_len`` check only the length returned by
        ``parse_address_list`` / ``parse_mailbox_list``. Otherwise ``raw`` is
        parsed with ``parse``; ``members`` checks the number of group members
        and every other key is compared against the attribute of that name.
        """
        parser = AddressParser(strict=strict)
        if "list_len" in expected:
            self.assertEqual(len(parser.parse_address_list(raw)), expected["list_len"])
            return
        if "mailbox_len" in expected:
            self.assertEqual(len(parser.parse_mailbox_list(raw)), expected["mailbox_len"])
            return
        parsed = parser.parse(raw)
        for key, value in expected.items():
            if key == "members":
                self.assertEqual(len(parsed.group_members), value)
            else:
                self.assertEqual(getattr(parsed, key), value)
        self.assertEqual(parsed.source, raw)


def _add_success_case(name: str, raw: str, expected: dict[str, object], strict: bool) -> None:
    """Attach ``test_<name>`` to ParserGeneratedTests for a case that must parse."""

    def test_case(self: ParserGeneratedTests) -> None:
        self.assert_address(raw, expected, strict=strict)

    setattr(ParserGeneratedTests, f"test_{name}", test_case)


def _add_invalid_case(name: str, raw: str) -> None:
    """Attach ``test_<name>`` asserting that strict ``parse`` raises ParseError."""

    def test_case(self: ParserGeneratedTests) -> None:
        with self.assertRaises(ParseError):
            AddressParser(strict=True).parse(raw)

    setattr(ParserGeneratedTests, f"test_{name}", test_case)


for case_name, case_raw, case_expected in STRICT_CASES:
    _add_success_case(case_name, case_raw, case_expected, True)

for case_name, case_raw, case_expected in PERMISSIVE_CASES:
    _add_success_case(case_name, case_raw, case_expected, False)

for case_name, case_raw in INVALID_CASES:
    _add_invalid_case(case_name, case_raw)


if __name__ == "__main__":
    unittest.main()