diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5eb15d8..235298b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -14,22 +14,10 @@ jobs:
fail-fast: false
matrix:
php_version:
- - '8.0'
- - '8.1'
- - '8.2'
- - '8.3'
- '8.4'
dependencies:
- 'default'
include:
- - php_version: '8.0'
- dependencies: 'lowest'
- - php_version: '8.1'
- dependencies: 'lowest'
- - php_version: '8.2'
- dependencies: 'lowest'
- - php_version: '8.3'
- dependencies: 'lowest'
- php_version: '8.4'
dependencies: 'lowest'
steps:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 947d5f6..e6712d8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,13 @@ This project adheres to [Semantic Versioning](http://semver.org/).
## Unreleased
+* Use native `HTMLDocument` parsing instead of masterminds/html5
+* Drop support for PHP < 8.4 (to allow us to use native HTML parsing)
+
+## 1.3.3 / 2026-03-10
+
+* Fix handling and reporting of parse errors when parsing an HTML table.
+
## 1.3.2 / 2025-05-06
* Support PHP 8.4 (thanks @mharmuth)
diff --git a/composer.json b/composer.json
index f46a2ed..48efb25 100644
--- a/composer.json
+++ b/composer.json
@@ -13,8 +13,7 @@
"minimum-stability": "stable",
"require": {
"behat/gherkin": ">=2.0.0 <5.0.0",
- "masterminds/html5": "^2.7.5",
- "php": "~8.0.0 || ~8.1.0 || ~8.2.0 || ~8.3.0 || ~8.4.0",
+ "php": ">= 8.4 < 8.5",
"ext-dom": "*",
"ext-SimpleXML": "*",
"ext-libxml": "*"
diff --git a/src/TableParser/HTML/HTMLStringTableParser.php b/src/TableParser/HTML/HTMLStringTableParser.php
index d309d97..0b56345 100644
--- a/src/TableParser/HTML/HTMLStringTableParser.php
+++ b/src/TableParser/HTML/HTMLStringTableParser.php
@@ -7,9 +7,13 @@
namespace Ingenerator\BehatTableAssert\TableParser\HTML;
+use Behat\Gherkin\Node\TableNode;
+use Dom\HTMLDocument;
+use Dom\HTMLElement;
use Ingenerator\BehatTableAssert\TableNode\PaddedTableNode;
use LibXMLError;
use Masterminds\HTML5;
+use function Dom\import_simplexml;
/**
* Parses an HTML string for a <table> element into a TableNode. The table must have a single row
@@ -90,33 +94,43 @@ public function parse($html)
*/
protected function parseHTMLString($html)
{
- $old_use_internal_errors = \libxml_use_internal_errors(TRUE);
try {
- $html5 = new HTML5();
- $dom = $html5->loadHTML(
- ''
- .' '
- .''.\trim($html).''
- .''
+ set_error_handler(
+ fn($errno, $errstr) => throw new \InvalidArgumentException(
+ sprintf("Invalid HTML: %s\n\n===HTML===\n%s", $errstr, $html),
+ ),
);
- $table_elem = $dom->getElementsByTagName('body')->item(0)->firstChild;
- $table = \simplexml_import_dom($table_elem);
- if ($errors = \libxml_get_errors()) {
- $this->throwInvalidHTMLException($html, $errors);
- }
+ $dom = HTMLDocument::createFromString(
+ sprintf(
+ <<<'HTML'
+
+
+
+ %s
+
+ HTML,
+ trim($html),
+ ),
+ );
} finally {
- \libxml_clear_errors();
- \libxml_use_internal_errors($old_use_internal_errors);
+ restore_error_handler();
+ }
+ $table_elem = $dom->getElementsByTagName('body')->item(0)->firstChild;
+
+ if (!$table_elem instanceof HTMLElement) {
+ throw new \InvalidArgumentException(
+ sprintf("Expected html root element but got %s\n\n===HTML===\n%s", get_debug_type($table_elem), $html),
+ );
}
+ $table = \simplexml_import_dom($table_elem);
return $table;
}
/**
- * @param string $html
- * @param LibXMLError[] $errors
+ * @deprecated no longer called by the library
*/
protected function throwInvalidHTMLException($html, $errors)
{
@@ -224,7 +238,7 @@ protected function findCellTextValues(\SimpleXMLElement $table_row)
*/
protected function parseCellText(\SimpleXmlElement $cell)
{
- $text = \trim(\preg_replace('/\s+/', ' ', \dom_import_simplexml($cell)->textContent));
+ $text = \trim(\preg_replace('/\s+/', ' ', import_simplexml($cell)->textContent));
if ($prefix = (string) $cell['data-behat-table-prefix']) {
$text = $prefix.' '.$text;
diff --git a/test/TableParser/HTML/HTMLStringTableParserTest.php b/test/TableParser/HTML/HTMLStringTableParserTest.php
index 2303175..d29b8df 100644
--- a/test/TableParser/HTML/HTMLStringTableParserTest.php
+++ b/test/TableParser/HTML/HTMLStringTableParserTest.php
@@ -35,37 +35,16 @@ public function test_it_throws_when_parsing_non_or_empty_string($value)
}
/**
- * @testWith ["random", false]
- * ["random", true]
- * ["
", false]
- * ["", false]
+ * @testWith ["random text", "Expected html root element but got Dom\\Text"]
+ * ["<17sd>illegal tag name17sd>
", "Invalid HTML: Dom\\HTMLDocument::createFromString(): tokenizer error invalid-first-character-of-tag-name"]
+ * ["<div></div>", "Expected a <table> but got div"]
*/
- public function test_it_always_restores_state_of_libxml_error_handling(
- $html,
- $use_errors_before
- ) {
- $old_setting = \libxml_use_internal_errors($use_errors_before);
- try {
- $this->newSubject()->parse($html);
- } catch (\Exception $e) { /* ignore */
- }
- $errors_after = \libxml_get_errors();
- $use_errors_after = \libxml_use_internal_errors($old_setting);
-
- $this->assertSame([], $errors_after, 'Should clear libxml errors');
- $this->assertEquals(
- $use_errors_before,
- $use_errors_after,
- 'Should restore libxml_use_internal_errors'
- );
- }
-
- public function test_it_throws_when_parsing_html_that_is_not_a_table()
+ public function test_it_throws_when_parsing_html_that_is_not_a_table_or_not_valid(string $input, string $expect_msg)
{
$this->expectException(\InvalidArgumentException::class);
- $this->expectExceptionMessage('Expected a ');
+ $this->expectExceptionMessage($expect_msg);
- $this->newSubject()->parse('
');
+ $this->newSubject()->parse($input);
}
public function test_it_throws_when_parsing_table_without_thead()
@@ -224,7 +203,7 @@ public function provider_valid_html_tables()
''.
'Header Date '.
''.
- 'Cell1 30 Aug 2016 '.
+ 'Cell1 30 Aug 2016 '.
'
',
[
['Header', 'Date'],