From ab7bc139679917646255657fc00d2b1032844785 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 23 Jan 2026 14:02:14 +0000 Subject: [PATCH 1/3] Initial plan From d246a1fd915f8d7e58e370741542594815fdfab5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 23 Jan 2026 14:09:55 +0000 Subject: [PATCH 2/3] Add UTF-8 sanitization to prevent PostgreSQL encoding errors Co-authored-by: HNygard <168380+HNygard@users.noreply.github.com> --- .../src/class/ThreadEmailDatabaseSaver.php | 31 +++--- organizer/src/class/ThreadUtils.php | 52 ++++++++++ organizer/src/tests/ThreadUtilsUtf8Test.php | 99 +++++++++++++++++++ 3 files changed, 170 insertions(+), 12 deletions(-) create mode 100644 organizer/src/tests/ThreadUtilsUtf8Test.php diff --git a/organizer/src/class/ThreadEmailDatabaseSaver.php b/organizer/src/class/ThreadEmailDatabaseSaver.php index 2ae0dfb0..a94b97e8 100644 --- a/organizer/src/class/ThreadEmailDatabaseSaver.php +++ b/organizer/src/class/ThreadEmailDatabaseSaver.php @@ -15,6 +15,7 @@ require_once __DIR__ . '/ThreadFolderManager.php'; require_once __DIR__ . '/ThreadStorageManager.php'; require_once __DIR__ . '/ThreadEmailProcessingErrorManager.php'; +require_once __DIR__ . '/ThreadUtils.php'; use Imap\ImapConnection; use Imap\ImapEmailProcessor; @@ -66,6 +67,9 @@ public function saveThreadEmails(string $folder): array { # Figure out which thread this email is part of $all_emails = $email->getEmailAddresses($rawEmail); + + // Sanitize email addresses to prevent UTF-8 encoding issues + $all_emails = array_map('sanitizeUtf8String', $all_emails); $email_identifier = date('Y-m-d__His', $email->timestamp) . '__' . md5($email->subject); @@ -254,6 +258,9 @@ private function emailExistsInDatabase(string $threadId, string $emailIdOld): bo * @return string UUID of the saved email */ private function saveEmailToDatabase(string $threadId, object $email, string $direction, string $filename, string $rawEmail, stdClass $imap_headers): string { + // Sanitize IMAP headers to ensure all text fields have valid UTF-8 + $sanitized_headers = sanitizeUtf8Recursive($imap_headers); + $query = " INSERT INTO thread_emails ( thread_id, @@ -278,8 +285,8 @@ private function saveEmailToDatabase(string $threadId, object $email, string $di ':email_type' => $direction, ':status_type' => ThreadEmailStatusType::UNKNOWN->value, ':status_text' => 'Uklassifisert', - ':imap_headers' => json_encode($imap_headers, JSON_UNESCAPED_UNICODE ^ JSON_UNESCAPED_SLASHES), - ':id_old' => $filename + ':imap_headers' => json_encode($sanitized_headers, JSON_UNESCAPED_UNICODE ^ JSON_UNESCAPED_SLASHES), + ':id_old' => sanitizeUtf8String($filename) ]; // Handle binary content separately @@ -321,10 +328,10 @@ public function saveAttachmentToDatabase(string $emailId, object $attachment, $c $params = [ ':email_id' => $emailId, - ':name' => $attachment->name, - ':filename' => $attachment->filename, - ':filetype' => $attachment->filetype, - ':location' => $attachment->location, + ':name' => sanitizeUtf8String($attachment->name), + ':filename' => sanitizeUtf8String($attachment->filename), + ':filetype' => sanitizeUtf8String($attachment->filetype), + ':location' => sanitizeUtf8String($attachment->location), ':status_type' => ThreadEmailStatusType::UNKNOWN->value, ':status_text' => 'uklassifisert-dok' ]; @@ -377,13 +384,13 @@ private function saveEmailProcessingError( string $folderName ): void { ThreadEmailProcessingErrorManager::saveEmailProcessingError( - $emailIdentifier, - $emailSubject, - $emailAddresses, - $errorType, - $errorMessage, + sanitizeUtf8String($emailIdentifier), + sanitizeUtf8String($emailSubject), + sanitizeUtf8String($emailAddresses), + sanitizeUtf8String($errorType), + sanitizeUtf8String($errorMessage), $suggestedThreadId, - $folderName + sanitizeUtf8String($folderName) ); } } diff --git a/organizer/src/class/ThreadUtils.php b/organizer/src/class/ThreadUtils.php index ec412066..4505668e 100644 --- a/organizer/src/class/ThreadUtils.php +++ b/organizer/src/class/ThreadUtils.php @@ -286,3 +286,55 @@ function getEmailCcAddressesFromImapHeaders($imapHeaders) { return extractAddressesFromEmailObjects($ccObjects); } + +/** + * Sanitize UTF-8 string by replacing invalid byte sequences with replacement character + * This prevents PostgreSQL UTF-8 encoding errors when inserting data from IMAP + * + * @param string $text Text to sanitize + * @return string Sanitized UTF-8 text + */ +function sanitizeUtf8String(string $text): string { + // mb_convert_encoding with 'UTF-8' to 'UTF-8' replaces invalid sequences + // The //IGNORE flag would skip invalid sequences, but we want to use the replacement character + $sanitized = mb_convert_encoding($text, 'UTF-8', 'UTF-8'); + + // If mb_convert_encoding fails, try iconv with TRANSLIT to replace problematic characters + if ($sanitized === false || $sanitized === '') { + $sanitized = iconv('UTF-8', 'UTF-8//IGNORE', $text); + if ($sanitized === false) { + // Last resort: use regex to remove invalid UTF-8 sequences + $sanitized = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x80-\x9F]/u', '?', $text); + if ($sanitized === null) { + // If even regex fails, return empty string + return ''; + } + } + } + + return $sanitized; +} + +/** + * Recursively sanitize UTF-8 strings in an object or array + * This ensures all text data is safe for PostgreSQL UTF-8 encoding + * + * @param mixed $data Data to sanitize (object, array, or string) + * @return mixed Sanitized data + */ +function sanitizeUtf8Recursive($data) { + if (is_string($data)) { + return sanitizeUtf8String($data); + } elseif (is_array($data)) { + foreach ($data as $key => $value) { + $data[$key] = sanitizeUtf8Recursive($value); + } + return $data; + } elseif (is_object($data)) { + foreach ($data as $key => $value) { + $data->$key = sanitizeUtf8Recursive($value); + } + return $data; + } + return $data; +} diff --git a/organizer/src/tests/ThreadUtilsUtf8Test.php b/organizer/src/tests/ThreadUtilsUtf8Test.php new file mode 100644 index 00000000..9bc18d1c --- /dev/null +++ b/organizer/src/tests/ThreadUtilsUtf8Test.php @@ -0,0 +1,99 @@ +assertEquals($validUtf8, $result); + } + + public function testSanitizeUtf8String_InvalidSequence() { + // Invalid UTF-8 sequence: 0xC3 followed by space (0x20) instead of continuation byte + // This is the exact error from the issue: "invalid byte sequence for encoding "UTF8": 0xc3 0x20" + $invalidUtf8 = "Test" . chr(0xC3) . chr(0x20) . "string"; + $result = sanitizeUtf8String($invalidUtf8); + + // The result should be a valid UTF-8 string + $this->assertNotFalse(mb_check_encoding($result, 'UTF-8')); + // The invalid sequence should have been replaced or removed + $this->assertNotEquals($invalidUtf8, $result); + } + + public function testSanitizeUtf8String_MultipleInvalidSequences() { + // Multiple invalid UTF-8 sequences + $invalidUtf8 = chr(0xC3) . chr(0x20) . "test" . chr(0xFF) . chr(0xFE); + $result = sanitizeUtf8String($invalidUtf8); + + // The result should be a valid UTF-8 string + $this->assertNotFalse(mb_check_encoding($result, 'UTF-8')); + } + + public function testSanitizeUtf8String_EmptyString() { + $result = sanitizeUtf8String(""); + $this->assertEquals("", $result); + } + + public function testSanitizeUtf8Recursive_Array() { + $data = [ + 'valid' => 'Hello', + 'invalid' => "Test" . chr(0xC3) . chr(0x20) . "string" + ]; + + $result = sanitizeUtf8Recursive($data); + + // Valid string should remain unchanged + $this->assertEquals('Hello', $result['valid']); + // Invalid string should be sanitized + $this->assertNotFalse(mb_check_encoding($result['invalid'], 'UTF-8')); + $this->assertNotEquals($data['invalid'], $result['invalid']); + } + + public function testSanitizeUtf8Recursive_Object() { + $data = new stdClass(); + $data->valid = 'Hello'; + $data->invalid = "Test" . chr(0xC3) . chr(0x20) . "string"; + $data->nested = new stdClass(); + $data->nested->value = "Nested" . chr(0xFF); + + $result = sanitizeUtf8Recursive($data); + + // Valid string should remain unchanged + $this->assertEquals('Hello', $result->valid); + // Invalid strings should be sanitized + $this->assertNotFalse(mb_check_encoding($result->invalid, 'UTF-8')); + $this->assertNotFalse(mb_check_encoding($result->nested->value, 'UTF-8')); + } + + public function testSanitizeUtf8Recursive_MixedTypes() { + $data = [ + 'string' => "Test" . chr(0xC3) . chr(0x20), + 'number' => 42, + 'boolean' => true, + 'null' => null, + 'array' => ['nested' => "Invalid" . chr(0xFF)] + ]; + + $result = sanitizeUtf8Recursive($data); + + // Non-string types should remain unchanged + $this->assertEquals(42, $result['number']); + $this->assertEquals(true, $result['boolean']); + $this->assertEquals(null, $result['null']); + // Strings should be sanitized + $this->assertNotFalse(mb_check_encoding($result['string'], 'UTF-8')); + $this->assertNotFalse(mb_check_encoding($result['array']['nested'], 'UTF-8')); + } + + public function testSanitizeUtf8String_NorwegianCharacters() { + // Test with Norwegian characters (should remain valid) + $norwegianText = "Snåsa kommune - Innsyn i håndskrevet opptellingsdata"; + $result = sanitizeUtf8String($norwegianText); + $this->assertEquals($norwegianText, $result); + $this->assertNotFalse(mb_check_encoding($result, 'UTF-8')); + } +} From c9d7d26d59e066e689ec0d02b0e2ffc423c26b97 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 23 Jan 2026 14:12:19 +0000 Subject: [PATCH 3/3] Address code review feedback: improve UTF-8 sanitization logic Co-authored-by: HNygard <168380+HNygard@users.noreply.github.com> --- organizer/src/class/ThreadUtils.php | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/organizer/src/class/ThreadUtils.php b/organizer/src/class/ThreadUtils.php index 4505668e..0ed284c8 100644 --- a/organizer/src/class/ThreadUtils.php +++ b/organizer/src/class/ThreadUtils.php @@ -295,16 +295,22 @@ function getEmailCcAddressesFromImapHeaders($imapHeaders) { * @return string Sanitized UTF-8 text */ function sanitizeUtf8String(string $text): string { + // Handle empty strings early + if ($text === '') { + return ''; + } + // mb_convert_encoding with 'UTF-8' to 'UTF-8' replaces invalid sequences - // The //IGNORE flag would skip invalid sequences, but we want to use the replacement character $sanitized = mb_convert_encoding($text, 'UTF-8', 'UTF-8'); - // If mb_convert_encoding fails, try iconv with TRANSLIT to replace problematic characters - if ($sanitized === false || $sanitized === '') { - $sanitized = iconv('UTF-8', 'UTF-8//IGNORE', $text); + // If mb_convert_encoding fails, try iconv with SUBSTITUTE to replace problematic characters + if ($sanitized === false) { + $sanitized = @iconv('UTF-8', 'UTF-8//IGNORE', $text); if ($sanitized === false) { - // Last resort: use regex to remove invalid UTF-8 sequences - $sanitized = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x80-\x9F]/u', '?', $text); + // Last resort: manually filter out invalid bytes + // Remove control characters and other problematic bytes + // Don't use 'u' modifier since we're dealing with potentially invalid UTF-8 + $sanitized = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/', '', $text); if ($sanitized === null) { // If even regex fails, return empty string return '';