LinkFinder/test_parser.py at master · scottmmjackson/LinkFinder · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#!/usr/bin/env python
from pathlib import Path

from linkfinder import extract_script_urls, parser_file, regex_str

# Fixture directory, located relative to this test module: <module dir>/tests/fixtures.
_FIXTURES = Path(__file__).parent.joinpath("tests", "fixtures")


def get_links(source: str) -> list[str]:
    """Parse *source* in CLI mode (``mode=0``) and collect each endpoint's link.

    Calls ``parser_file`` with the module-level ``regex_str`` and returns the
    ``"link"`` value of every endpoint it yields, in the order reported.
    """
    links: list[str] = []
    for endpoint in parser_file(source, regex_str, mode=0):
        links.append(endpoint["link"])
    return links


def test_parser_cli() -> None:
    """Check single-literal inputs across each link category in CLI mode.

    Every assertion feeds one double-quoted string literal to ``get_links``
    and expects either exactly that literal's content back or, for inputs
    that must be rejected, an empty list.
    """
    # Full URLs
    assert get_links('"http://example.com"') == ["http://example.com"]
    assert get_links('"smb://example.com"') == ["smb://example.com"]
    assert get_links('"https://www.example.co.us"') == ["https://www.example.co.us"]

    # Absolute / relative paths
    assert get_links('"/path/to/file"') == ["/path/to/file"]
    assert get_links('"../path/to/file"') == ["../path/to/file"]
    assert get_links('"./path/to/file"') == ["./path/to/file"]
    assert get_links('"/user/create.action?user=Test"') == ["/user/create.action?user=Test"]
    assert get_links('"/api/create.php?user=test&pass=test#home"') == [
        "/api/create.php?user=test&pass=test#home"
    ]
    # Characters such as "<" and ">" must prevent a match
    assert get_links('"/wrong/file/test<>b"') == []

    # Relative paths with extensions
    assert get_links('"api/create.php"') == ["api/create.php"]
    assert get_links('"api/create.php?user=test"') == ["api/create.php?user=test"]
    assert get_links('"api/create.php?user=test&pass=test"') == [
        "api/create.php?user=test&pass=test"
    ]
    assert get_links('"api/create.php?user=test#home"') == ["api/create.php?user=test#home"]
    assert get_links('"user/create.action?user=Test"') == ["user/create.action?user=Test"]
    # Unknown extension on a relative path must not match
    assert get_links('"user/create.notaext?user=Test"') == []

    # REST API endpoints (no extension)
    assert get_links('"api/user"') == ["api/user"]
    assert get_links('"v1/create"') == ["v1/create"]
    assert get_links('"api/v1/user/2"') == ["api/v1/user/2"]
    assert get_links('"api/v1/search?text=Test Hello"') == ["api/v1/search?text=Test Hello"]

    # Filenames with known extensions
    assert get_links('"test_1.json"') == ["test_1.json"]
    assert get_links('"test2.aspx?arg1=tmp1+tmp2&arg2=tmp3"') == [
        "test2.aspx?arg1=tmp1+tmp2&arg2=tmp3"
    ]
    assert get_links('"addUser.action"') == ["addUser.action"]
    assert get_links('"main.js"') == ["main.js"]
    assert get_links('"index.html"') == ["index.html"]
    assert get_links('"robots.txt"') == ["robots.txt"]
    assert get_links('"users.xml"') == ["users.xml"]
    # ".name" is not a recognised extension, so a bare attribute access is ignored
    assert get_links('"UserModel.name"') == []

    # Multi-dot filenames
    assert get_links('"app/admin/admin.controller.js"') == ["app/admin/admin.controller.js"]
    assert get_links('"services/customer.services.js"') == ["services/customer.services.js"]


def test_parser_cli_multi() -> None:
    """Two different links in one source string are both reported (order is not checked)."""
    source = 'href="http://example.com";href="/api/create.php"'
    expected = {"http://example.com", "/api/create.php"}
    assert set(get_links(source)) == expected


def test_parser_unique() -> None:
    """A link occurring several times in the source is reported only once."""
    # Same URL twice: the result list must hold a single entry.
    repeated = 'href="http://example.com";document.window.location="http://example.com"'
    assert get_links(repeated) == ["http://example.com"]

    # Same URL twice plus a different path: two distinct links, order not checked.
    mixed = 'href="http://example.com";<img src="http://example.com">;href="/api/create.php"'
    assert set(get_links(mixed)) == {"http://example.com", "/api/create.php"}


def test_b6_multidot_filename() -> None:
    """B6 regression: a bare filename whose stem contains dots must still match."""
    multidot_names = ("some-name.anotherone.js", "my.lib.min.js")
    for filename in multidot_names:
        # Each name is presented to the parser as a double-quoted literal.
        assert get_links(f'"{filename}"') == [filename]

    # An extension outside the whitelist must keep producing no result
    assert get_links('"UserModel.name"') == []


def test_b7_ipv4_url() -> None:
    """B7 regression: HTTP(S) URLs whose host is an IPv4 address must match."""
    ipv4_urls = (
        "http://127.0.0.1/path",
        "https://192.168.1.1/api/v1",
        "http://10.0.0.1",
    )
    for url in ipv4_urls:
        # Each URL is presented to the parser as a double-quoted literal.
        assert get_links(f'"{url}"') == [url]

    # Hostname-based URLs must keep matching
    named_host = "http://example.com"
    assert get_links(f'"{named_host}"') == [named_host]


# ---------------------------------------------------------------------------
# Real minified bundle fixture
# ---------------------------------------------------------------------------

# Endpoints that must be reported for bundle.min.js. Deliberately a subset
# rather than an exhaustive list: it samples each regex branch and includes
# the B3/B6/B7 regression cases.
_EXPECTED_IN_BUNDLE = {
    # Full and protocol-relative URLs (the IPv4 host is the B7 case)
    "https://api.example.com/v2",
    "http://10.0.0.5/healthcheck",
    "//cdn.example.com/assets/app.bundle.min.js",
    # Absolute paths
    "/api/v1/auth/token",
    "/api/v1/auth/login",
    "/api/v1/auth/logout",
    "/api/v1/user/profile",
    "/api/v1/admin/users",
    "/api/v1/session/check",
    # B3: the query string keeps raw "&", never "&amp;"
    "/api/v2/search?q=&limit=20&offset=0",
    # Relative path with extension, and an extension-less REST API path
    "api/upload.php",
    "api/v1/password/reset",
    # Bare filenames (the multi-dot ones are the B6 case)
    "main.js",
    "vendor.bundle.min.js",
    "polyfills.bundle.min.js",
}

# "application/json" is picked up by the REST API branch. It is a known false
# positive, asserted on purpose to document current regex behaviour, not a bug.
_KNOWN_FALSE_POSITIVES = {"application/json"}


def test_minified_bundle_cli_mode() -> None:
    """All expected endpoints are found in the minified fixture (mode=0, no beautifier).

    The fixture is decoded explicitly as UTF-8 so the outcome does not depend
    on the locale's default text encoding.
    """
    content = (_FIXTURES / "bundle.min.js").read_text(encoding="utf-8")
    found = set(get_links(content))
    # Subset check: the parser may report more links than the expected sample.
    assert _EXPECTED_IN_BUNDLE <= found, f"Missing endpoints: {_EXPECTED_IN_BUNDLE - found}"


def test_minified_bundle_html_mode() -> None:
    """Mode=1 beautifies the source and populates context for each endpoint.

    The fixture is decoded explicitly as UTF-8 so the outcome does not depend
    on the locale's default text encoding.
    """
    content = (_FIXTURES / "bundle.min.js").read_text(encoding="utf-8")
    endpoints = parser_file(content, regex_str, mode=1)
    found_links = {ep["link"] for ep in endpoints}

    # Subset check: the parser may report more links than the expected sample.
    assert _EXPECTED_IN_BUNDLE <= found_links, (
        f"Missing endpoints: {_EXPECTED_IN_BUNDLE - found_links}"
    )
    # Every mode=1 result must carry surrounding context.
    assert all("context" in ep for ep in endpoints), "Some mode=1 endpoints are missing 'context'"


def test_minified_bundle_b3_no_html_escape() -> None:
    """B3: query-string & must appear verbatim, never as &amp;.

    The fixture is decoded explicitly as UTF-8 so the outcome does not depend
    on the locale's default text encoding.
    """
    content = (_FIXTURES / "bundle.min.js").read_text(encoding="utf-8")
    found = set(get_links(content))
    assert "/api/v2/search?q=&limit=20&offset=0" in found
    assert "/api/v2/search?q=&amp;limit=20&amp;offset=0" not in found


def test_minified_bundle_known_false_positives() -> None:
    """Document false positives so regressions are caught if the regex tightens.

    The fixture is decoded explicitly as UTF-8 so the outcome does not depend
    on the locale's default text encoding.
    """
    content = (_FIXTURES / "bundle.min.js").read_text(encoding="utf-8")
    found = set(get_links(content))
    # Fails when a documented false positive stops being reported.
    assert _KNOWN_FALSE_POSITIVES <= found, (
        "Expected false-positive set changed — update test if intentional"
    )


# ---------------------------------------------------------------------------
# extract_script_urls (B5)
# ---------------------------------------------------------------------------


def test_extract_script_urls_quoted() -> None:
    """A double-quoted ``src`` attribute on a script tag is extracted."""
    page = '<html><script src="/js/app.js"></script></html>'
    urls = extract_script_urls(page)
    assert urls == ["/js/app.js"]


def test_extract_script_urls_unquoted() -> None:
    """B5: an unquoted ``src`` attribute (valid HTML5) must be extracted too."""
    page = "<html><script src=/js/vendor.js></script></html>"
    urls = extract_script_urls(page)
    assert urls == ["/js/vendor.js"]


def test_extract_script_urls_mixed() -> None:
    """Unquoted, quoted and protocol-relative srcs are all returned, in document order."""
    script_tags = [
        "<script src=/js/a.js></script>",
        '<script src="/js/b.js"></script>',
        "<script src=//cdn.example.com/c.js></script>",
    ]
    # The tags are concatenated with no separator into one document.
    page = "".join(script_tags)
    expected = ["/js/a.js", "/js/b.js", "//cdn.example.com/c.js"]
    assert extract_script_urls(page) == expected


def test_extract_script_urls_no_src() -> None:
    """An inline <script> tag with no ``src`` attribute yields an empty list."""
    page = "<script>var x = 1;</script>"
    urls = extract_script_urls(page)
    assert urls == []