From 128b3519b712127291ba203d81a68d9ef23e8aa4 Mon Sep 17 00:00:00 2001 From: docJerem Date: Wed, 6 May 2026 09:58:54 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Fix=20/=20UTF-8=20characters=20i?= =?UTF-8?q?n=20SAMLResponse=20rejected=20by=20xmerl=5Fscan?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit decode_response/2 (and the encrypted-assertion + metadata parse paths) piped the XML binary through to_charlist/1 before xmerl_scan.string/2. to_charlist decodes UTF-8 into Unicode codepoints, but xmerl_scan expects raw UTF-8 bytes and does its own decoding — feeding it codepoints made it reject any non-ASCII char with {:wfc_Legal_Character, {:bad_character, _}}, crashing /api/sp/consume on assertions containing accents. Switches to :binary.bin_to_list/1 at the four call sites and adds regression coverage across 2/3/4-byte UTF-8 sequences in element text, attribute values, DEFLATE and non-DEFLATE response paths, plus metadata OrganizationName. --- lib/ex_saml/core/binding.ex | 8 +- lib/ex_saml/core/sp.ex | 2 +- lib/ex_saml/metadata.ex | 2 +- test/ex_saml/core/binding_test.exs | 129 +++++++++++++++++++++++++++++ test/ex_saml/metadata_test.exs | 36 ++++++++ 5 files changed, 171 insertions(+), 6 deletions(-) diff --git a/lib/ex_saml/core/binding.ex b/lib/ex_saml/core/binding.ex index ba51eb4..2c89df1 100644 --- a/lib/ex_saml/core/binding.ex +++ b/lib/ex_saml/core/binding.ex @@ -66,7 +66,7 @@ defmodule ExSaml.Core.Binding do saml_response |> :base64.decode() |> :zlib.unzip() - |> to_charlist() + |> :binary.bin_to_list() {xml, _} = :xmerl_scan.string(xml_data, namespace_conformant: true, allow_entities: false) xml @@ -77,11 +77,11 @@ defmodule ExSaml.Core.Binding do xml_data = try do - :zlib.unzip(data) |> to_charlist() + :zlib.unzip(data) |> :binary.bin_to_list() rescue - _e -> to_charlist(data) + _e -> :binary.bin_to_list(data) catch - _kind, _reason -> to_charlist(data) + _kind, _reason -> :binary.bin_to_list(data) end {xml, _} = :xmerl_scan.string(xml_data, namespace_conformant: true, allow_entities: false) diff --git a/lib/ex_saml/core/sp.ex b/lib/ex_saml/core/sp.ex index d2215d6..f382448 100644 --- a/lib/ex_saml/core/sp.ex +++ b/lib/ex_saml/core/sp.ex @@ -517,7 +517,7 @@ defmodule ExSaml.Core.Sp do assertion_xml = block_decrypt(to_string(algorithm), symmetric_key, cipher_value) {assertion, _} = - :xmerl_scan.string(to_charlist(assertion_xml), + :xmerl_scan.string(:binary.bin_to_list(assertion_xml), namespace_conformant: true, allow_entities: false ) diff --git a/lib/ex_saml/metadata.ex b/lib/ex_saml/metadata.ex index 6c6d3f7..c83b1cb 100644 --- a/lib/ex_saml/metadata.ex +++ b/lib/ex_saml/metadata.ex @@ -100,7 +100,7 @@ defmodule ExSaml.Metadata do # --------------------------------------------------------------------------- defp parse(xml) do - charlist = String.to_charlist(xml) + charlist = :binary.bin_to_list(xml) try do {root, _rest} = diff --git a/test/ex_saml/core/binding_test.exs b/test/ex_saml/core/binding_test.exs index e7e9c7b..a36d155 100644 --- a/test/ex_saml/core/binding_test.exs +++ b/test/ex_saml/core/binding_test.exs @@ -14,10 +14,40 @@ defmodule ExSaml.Core.BindingTest do Record.defrecord(:xmlText, Record.extract(:xmlText, from_lib: "xmerl/include/xmerl.hrl")) + Record.defrecord( + :xmlAttribute, + Record.extract(:xmlAttribute, from_lib: "xmerl/include/xmerl.hrl") + ) + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- + # Concatenates the text content of every direct xmlText child of `elem` + # (xmerl returns text as charlists of codepoints). + defp text_value(elem) do + elem + |> xmlElement(:content) + |> Enum.flat_map(fn + child when Record.is_record(child, :xmlElement) -> + child |> xmlElement(:content) |> Enum.flat_map(&extract_text/1) + + other -> + extract_text(other) + end) + end + + defp extract_text(node) when Record.is_record(node, :xmlText), do: xmlText(node, :value) + defp extract_text(_), do: [] + + defp attribute_value(elem, name) do + elem + |> xmlElement(:attributes) + |> Enum.find_value(fn attr -> + if xmlAttribute(attr, :name) == name, do: xmlAttribute(attr, :value) + end) + end + defp simple_request_element do # Build a minimal xmerl element xmlElement( @@ -161,6 +191,105 @@ defmodule ExSaml.Core.BindingTest do end end + # --------------------------------------------------------------------------- + # decode_response/2 — UTF-8 character regression + # + # Reproduces a production crash where xmerl_scan rejected character 233 (é) + # because the XML byte stream had been pre-decoded into Unicode codepoints + # via `to_charlist/1` before being handed to xmerl. xmerl expects a list of + # raw UTF-8 bytes and does its own decoding. The fix passes the bytes via + # `:binary.bin_to_list/1`. These tests guard the regression across the full + # UTF-8 range (2-byte, 3-byte, 4-byte sequences) and across both decode + # paths (DEFLATE and plain base64), in text content and in attribute values. + # --------------------------------------------------------------------------- + + describe "decode_response/2 — non-ASCII (UTF-8) regression" do + # {label, sample, expected_codepoints} + @utf8_samples [ + # 2-byte UTF-8 — Latin-1 Supplement + {"french é (the prod case)", "Hélène", [?H, 233, ?l, 232, ?n, ?e]}, + {"french é + space", "Émilie Côté", [?É, ?m, ?i, ?l, ?i, ?e, ?\s, ?C, ?ô, ?t, ?é]}, + {"spanish ñ", "Núñez", [?N, ?ú, ?ñ, ?e, ?z]}, + {"german ß", "Straße", [?S, ?t, ?r, ?a, ?ß, ?e]}, + {"german umlaut ü", "Müller", [?M, ?ü, ?l, ?l, ?e, ?r]}, + {"nordic ø", "Bjørn", [?B, ?j, ?ø, ?r, ?n]}, + # 2-byte UTF-8 — Latin Extended + {"polish ł", "Łukasz", [?Ł, ?u, ?k, ?a, ?s, ?z]}, + {"czech č", "Černý", [?Č, ?e, ?r, ?n, ?ý]}, + # 3-byte UTF-8 — BMP + {"euro sign €", "10€", [?1, ?0, ?€]}, + {"greek Ω", "ΩΑΘ", [?Ω, ?Α, ?Θ]}, + {"cyrillic", "Иван", [?И, ?в, ?а, ?н]}, + {"chinese", "中文", [?中, ?文]}, + {"japanese", "日本語", [?日, ?本, ?語]}, + # 4-byte UTF-8 — supplementary planes (emoji) + {"emoji 🎉", "party 🎉", [?p, ?a, ?r, ?t, ?y, ?\s, 0x1F389]} + ] + + for {label, sample, expected} <- @utf8_samples do + test "decodes #{label} in element text (non-deflate)" do + sample = unquote(sample) + expected = unquote(expected) + + xml = "" <> sample <> "" + b64 = Base.encode64(xml) + + decoded = Binding.decode_response("", b64) + + assert Record.is_record(decoded, :xmlElement) + assert xmlElement(decoded, :name) == :Response + assert text_value(decoded) == expected + end + + test "decodes #{label} in attribute value (non-deflate)" do + sample = unquote(sample) + # We escape only the chars that need escaping in attribute values. + attr = sample |> String.replace("\"", """) |> String.replace("&", "&") + xml = " attr <> "\">x" + b64 = Base.encode64(xml) + + decoded = Binding.decode_response("", b64) + + assert Record.is_record(decoded, :xmlElement) + assert attribute_value(decoded, :label) == unquote(expected) + end + + test "decodes #{label} in element text (DEFLATE)" do + sample = unquote(sample) + expected = unquote(expected) + + xml = "" <> sample <> "" + b64 = xml |> :zlib.zip() |> Base.encode64() + + decoded = + Binding.decode_response( + "urn:oasis:names:tc:SAML:2.0:bindings:URL-Encoding:DEFLATE", + b64 + ) + + assert Record.is_record(decoded, :xmlElement) + assert text_value(decoded) == expected + end + end + + test "decodes a realistic SAMLResponse with multiple accented AttributeValues" do + xml = + ~s( + ~s(xmlns="urn:oasis:names:tc:SAML:2.0:assertion">) <> + "" <> + ~s(Anaïs) <> + ~s(Lefèvre-Béchamp) <> + ~s(Müller — €10 🎉) <> + "" + + b64 = Base.encode64(xml) + decoded = Binding.decode_response("", b64) + + assert Record.is_record(decoded, :xmlElement) + assert xmlElement(decoded, :name) == :"samlp:Response" + end + end + # --------------------------------------------------------------------------- # encode_http_post/4 with nonce # --------------------------------------------------------------------------- diff --git a/test/ex_saml/metadata_test.exs b/test/ex_saml/metadata_test.exs index c6b50e2..e8121f5 100644 --- a/test/ex_saml/metadata_test.exs +++ b/test/ex_saml/metadata_test.exs @@ -38,6 +38,42 @@ defmodule ExSaml.MetadataTest do assert {:error, %ValidationResult{errors: [%{code: :invalid_xml}]}} = Metadata.validate("") end + + # Regression: parse/1 used to call String.to_charlist on the raw UTF-8 + # binary, producing a list of codepoints. xmerl_scan expects raw UTF-8 + # bytes and crashed with {:wfc_Legal_Character, {:bad_character, _}} on + # any non-ASCII character (e.g. an Organization name with an accent). + for {label, sample} <- [ + {"latin-1 supplement (é)", "Société Élysée SAS"}, + {"german umlaut (ü/ß)", "Müller Straße GmbH"}, + {"polish (Ł)", "Łukasz Sp. z o.o."}, + {"euro sign (€)", "10€ Org"}, + {"greek (Ω)", "ΩΑΘ Foundation"}, + {"cyrillic", "Иван Lab"}, + {"chinese", "中文公司"}, + {"emoji (🎉)", "Party 🎉 Inc."} + ] do + test "parses metadata containing #{label} without crashing" do + xml = """ + + + + + + + #{unquote(sample)} + #{unquote(sample)} + https://example.com + + + """ + + assert {:ok, %ValidationResult{}} = Metadata.validate(xml) + end + end end describe "root element" do