diff --git a/lib/ex_saml/core/binding.ex b/lib/ex_saml/core/binding.ex
index ba51eb4..2c89df1 100644
--- a/lib/ex_saml/core/binding.ex
+++ b/lib/ex_saml/core/binding.ex
@@ -66,7 +66,7 @@ defmodule ExSaml.Core.Binding do
saml_response
|> :base64.decode()
|> :zlib.unzip()
- |> to_charlist()
+ |> :binary.bin_to_list()
{xml, _} = :xmerl_scan.string(xml_data, namespace_conformant: true, allow_entities: false)
xml
@@ -77,11 +77,11 @@ defmodule ExSaml.Core.Binding do
xml_data =
try do
- :zlib.unzip(data) |> to_charlist()
+ :zlib.unzip(data) |> :binary.bin_to_list()
rescue
- _e -> to_charlist(data)
+ _e -> :binary.bin_to_list(data)
catch
- _kind, _reason -> to_charlist(data)
+ _kind, _reason -> :binary.bin_to_list(data)
end
{xml, _} = :xmerl_scan.string(xml_data, namespace_conformant: true, allow_entities: false)
diff --git a/lib/ex_saml/core/sp.ex b/lib/ex_saml/core/sp.ex
index d2215d6..f382448 100644
--- a/lib/ex_saml/core/sp.ex
+++ b/lib/ex_saml/core/sp.ex
@@ -517,7 +517,7 @@ defmodule ExSaml.Core.Sp do
assertion_xml = block_decrypt(to_string(algorithm), symmetric_key, cipher_value)
{assertion, _} =
- :xmerl_scan.string(to_charlist(assertion_xml),
+ :xmerl_scan.string(:binary.bin_to_list(assertion_xml),
namespace_conformant: true,
allow_entities: false
)
diff --git a/lib/ex_saml/metadata.ex b/lib/ex_saml/metadata.ex
index 6c6d3f7..c83b1cb 100644
--- a/lib/ex_saml/metadata.ex
+++ b/lib/ex_saml/metadata.ex
@@ -100,7 +100,7 @@ defmodule ExSaml.Metadata do
# ---------------------------------------------------------------------------
defp parse(xml) do
- charlist = String.to_charlist(xml)
+ charlist = :binary.bin_to_list(xml)
try do
{root, _rest} =
diff --git a/test/ex_saml/core/binding_test.exs b/test/ex_saml/core/binding_test.exs
index e7e9c7b..a36d155 100644
--- a/test/ex_saml/core/binding_test.exs
+++ b/test/ex_saml/core/binding_test.exs
@@ -14,10 +14,40 @@ defmodule ExSaml.Core.BindingTest do
Record.defrecord(:xmlText, Record.extract(:xmlText, from_lib: "xmerl/include/xmerl.hrl"))
+ Record.defrecord(
+ :xmlAttribute,
+ Record.extract(:xmlAttribute, from_lib: "xmerl/include/xmerl.hrl")
+ )
+
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
+ # Concatenates the text content of every direct xmlText child of `elem`
+ # (xmerl returns text as charlists of codepoints).
+ defp text_value(elem) do
+ elem
+ |> xmlElement(:content)
+ |> Enum.flat_map(fn
+ child when Record.is_record(child, :xmlElement) ->
+ child |> xmlElement(:content) |> Enum.flat_map(&extract_text/1)
+
+ other ->
+ extract_text(other)
+ end)
+ end
+
+ defp extract_text(node) when Record.is_record(node, :xmlText), do: xmlText(node, :value)
+ defp extract_text(_), do: []
+
+ defp attribute_value(elem, name) do
+ elem
+ |> xmlElement(:attributes)
+ |> Enum.find_value(fn attr ->
+ if xmlAttribute(attr, :name) == name, do: xmlAttribute(attr, :value)
+ end)
+ end
+
defp simple_request_element do
# Build a minimal xmerl element
xmlElement(
@@ -161,6 +191,105 @@ defmodule ExSaml.Core.BindingTest do
end
end
+ # ---------------------------------------------------------------------------
+ # decode_response/2 — UTF-8 character regression
+ #
+ # Reproduces a production crash where xmerl_scan rejected character 233 (é)
+ # because the XML byte stream had been pre-decoded into Unicode codepoints
+ # via `to_charlist/1` before being handed to xmerl. xmerl expects a list of
+ # raw UTF-8 bytes and does its own decoding. The fix passes the bytes via
+ # `:binary.bin_to_list/1`. These tests guard the regression across the full
+ # UTF-8 range (2-byte, 3-byte, 4-byte sequences) and across both decode
+ # paths (DEFLATE and plain base64), in text content and in attribute values.
+ # ---------------------------------------------------------------------------
+
+ describe "decode_response/2 — non-ASCII (UTF-8) regression" do
+ # {label, sample, expected_codepoints}
+ @utf8_samples [
+ # 2-byte UTF-8 — Latin-1 Supplement
+ {"french é (the prod case)", "Hélène", [?H, 233, ?l, 232, ?n, ?e]},
+ {"french é + space", "Émilie Côté", [?É, ?m, ?i, ?l, ?i, ?e, ?\s, ?C, ?ô, ?t, ?é]},
+ {"spanish ñ", "Núñez", [?N, ?ú, ?ñ, ?e, ?z]},
+ {"german ß", "Straße", [?S, ?t, ?r, ?a, ?ß, ?e]},
+ {"german umlaut ü", "Müller", [?M, ?ü, ?l, ?l, ?e, ?r]},
+ {"nordic ø", "Bjørn", [?B, ?j, ?ø, ?r, ?n]},
+ # 2-byte UTF-8 — Latin Extended
+ {"polish ł", "Łukasz", [?Ł, ?u, ?k, ?a, ?s, ?z]},
+ {"czech č", "Černý", [?Č, ?e, ?r, ?n, ?ý]},
+ # 3-byte UTF-8 — BMP
+ {"euro sign €", "10€", [?1, ?0, ?€]},
+ {"greek Ω", "ΩΑΘ", [?Ω, ?Α, ?Θ]},
+ {"cyrillic", "Иван", [?И, ?в, ?а, ?н]},
+ {"chinese", "中文", [?中, ?文]},
+ {"japanese", "日本語", [?日, ?本, ?語]},
+ # 4-byte UTF-8 — supplementary planes (emoji)
+ {"emoji 🎉", "party 🎉", [?p, ?a, ?r, ?t, ?y, ?\s, 0x1F389]}
+ ]
+
+ for {label, sample, expected} <- @utf8_samples do
+ test "decodes #{label} in element text (non-deflate)" do
+ sample = unquote(sample)
+ expected = unquote(expected)
+
+ xml = "" <> sample <> ""
+ b64 = Base.encode64(xml)
+
+ decoded = Binding.decode_response("", b64)
+
+ assert Record.is_record(decoded, :xmlElement)
+ assert xmlElement(decoded, :name) == :Response
+ assert text_value(decoded) == expected
+ end
+
+ test "decodes #{label} in attribute value (non-deflate)" do
+ sample = unquote(sample)
+ # We escape only the chars that need escaping in attribute values.
+ attr = sample |> String.replace("\"", """) |> String.replace("&", "&")
+ xml = " attr <> "\">x"
+ b64 = Base.encode64(xml)
+
+ decoded = Binding.decode_response("", b64)
+
+ assert Record.is_record(decoded, :xmlElement)
+ assert attribute_value(decoded, :label) == unquote(expected)
+ end
+
+ test "decodes #{label} in element text (DEFLATE)" do
+ sample = unquote(sample)
+ expected = unquote(expected)
+
+ xml = "" <> sample <> ""
+ b64 = xml |> :zlib.zip() |> Base.encode64()
+
+ decoded =
+ Binding.decode_response(
+ "urn:oasis:names:tc:SAML:2.0:bindings:URL-Encoding:DEFLATE",
+ b64
+ )
+
+ assert Record.is_record(decoded, :xmlElement)
+ assert text_value(decoded) == expected
+ end
+ end
+
+ test "decodes a realistic SAMLResponse with multiple accented AttributeValues" do
+ xml =
+ ~s(
+ ~s(xmlns="urn:oasis:names:tc:SAML:2.0:assertion">) <>
+ "" <>
+ ~s(Anaïs) <>
+ ~s(Lefèvre-Béchamp) <>
+ ~s(Müller — €10 🎉) <>
+ ""
+
+ b64 = Base.encode64(xml)
+ decoded = Binding.decode_response("", b64)
+
+ assert Record.is_record(decoded, :xmlElement)
+ assert xmlElement(decoded, :name) == :"samlp:Response"
+ end
+ end
+
# ---------------------------------------------------------------------------
# encode_http_post/4 with nonce
# ---------------------------------------------------------------------------
diff --git a/test/ex_saml/metadata_test.exs b/test/ex_saml/metadata_test.exs
index c6b50e2..e8121f5 100644
--- a/test/ex_saml/metadata_test.exs
+++ b/test/ex_saml/metadata_test.exs
@@ -38,6 +38,42 @@ defmodule ExSaml.MetadataTest do
assert {:error, %ValidationResult{errors: [%{code: :invalid_xml}]}} =
Metadata.validate("")
end
+
+ # Regression: parse/1 used to call String.to_charlist on the raw UTF-8
+ # binary, producing a list of codepoints. xmerl_scan expects raw UTF-8
+ # bytes and crashed with {:wfc_Legal_Character, {:bad_character, _}} on
+ # any non-ASCII character (e.g. an Organization name with an accent).
+ for {label, sample} <- [
+ {"latin-1 supplement (é)", "Société Élysée SAS"},
+ {"german umlaut (ü/ß)", "Müller Straße GmbH"},
+ {"polish (Ł)", "Łukasz Sp. z o.o."},
+ {"euro sign (€)", "10€ Org"},
+ {"greek (Ω)", "ΩΑΘ Foundation"},
+ {"cyrillic", "Иван Lab"},
+ {"chinese", "中文公司"},
+ {"emoji (🎉)", "Party 🎉 Inc."}
+ ] do
+ test "parses metadata containing #{label} without crashing" do
+ xml = """
+
+
+
+
+
+
+ #{unquote(sample)}
+ #{unquote(sample)}
+ https://example.com
+
+
+ """
+
+ assert {:ok, %ValidationResult{}} = Metadata.validate(xml)
+ end
+ end
end
describe "root element" do