From ab7bc139679917646255657fc00d2b1032844785 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 23 Jan 2026 14:02:14 +0000
Subject: [PATCH 1/3] Initial plan


From d246a1fd915f8d7e58e370741542594815fdfab5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 23 Jan 2026 14:09:55 +0000
Subject: [PATCH 2/3] Add UTF-8 sanitization to prevent PostgreSQL encoding
 errors

Co-authored-by: HNygard <168380+HNygard@users.noreply.github.com>
---
 .../src/class/ThreadEmailDatabaseSaver.php    | 31 +++---
 organizer/src/class/ThreadUtils.php           | 52 ++++++++++
 organizer/src/tests/ThreadUtilsUtf8Test.php   | 99 +++++++++++++++++++
 3 files changed, 170 insertions(+), 12 deletions(-)
 create mode 100644 organizer/src/tests/ThreadUtilsUtf8Test.php

diff --git a/organizer/src/class/ThreadEmailDatabaseSaver.php b/organizer/src/class/ThreadEmailDatabaseSaver.php
index 2ae0dfb0..a94b97e8 100644
--- a/organizer/src/class/ThreadEmailDatabaseSaver.php
+++ b/organizer/src/class/ThreadEmailDatabaseSaver.php
@@ -15,6 +15,7 @@
 require_once __DIR__ . '/ThreadFolderManager.php';
 require_once __DIR__ . '/ThreadStorageManager.php';
 require_once __DIR__ . '/ThreadEmailProcessingErrorManager.php';
+require_once __DIR__ . '/ThreadUtils.php';
 
 use Imap\ImapConnection;
 use Imap\ImapEmailProcessor;
@@ -66,6 +67,9 @@ public function saveThreadEmails(string $folder): array {
 
                 # Figure out which thread this email is part of
                 $all_emails = $email->getEmailAddresses($rawEmail);
+                
+                // Sanitize email addresses to prevent UTF-8 encoding issues
+                $all_emails = array_map('sanitizeUtf8String', $all_emails);
 
                 $email_identifier = date('Y-m-d__His', $email->timestamp) . '__' . md5($email->subject);
                 
@@ -254,6 +258,9 @@ private function emailExistsInDatabase(string $threadId, string $emailIdOld): bo
      * @return string UUID of the saved email
      */
     private function saveEmailToDatabase(string $threadId, object $email, string $direction, string $filename, string $rawEmail, stdClass $imap_headers): string {
+        // Sanitize IMAP headers to ensure all text fields have valid UTF-8 (note: the $imap_headers object itself is modified in place)
+        $sanitized_headers = sanitizeUtf8Recursive($imap_headers);
+        
         $query = "
             INSERT INTO thread_emails (
                 thread_id, 
@@ -278,8 +285,8 @@ private function saveEmailToDatabase(string $threadId, object $email, string $di
             ':email_type' => $direction,
             ':status_type' => ThreadEmailStatusType::UNKNOWN->value,
             ':status_text' => 'Uklassifisert',
-            ':imap_headers' => json_encode($imap_headers, JSON_UNESCAPED_UNICODE ^ JSON_UNESCAPED_SLASHES),
-            ':id_old' => $filename
+            ':imap_headers' => json_encode($sanitized_headers, JSON_UNESCAPED_UNICODE ^ JSON_UNESCAPED_SLASHES),
+            ':id_old' => sanitizeUtf8String($filename)
         ];
         
         // Handle binary content separately
@@ -321,10 +328,10 @@ public function saveAttachmentToDatabase(string $emailId, object $attachment, $c
         
         $params = [
             ':email_id' => $emailId,
-            ':name' => $attachment->name,
-            ':filename' => $attachment->filename,
-            ':filetype' => $attachment->filetype,
-            ':location' => $attachment->location,
+            ':name' => sanitizeUtf8String($attachment->name),
+            ':filename' => sanitizeUtf8String($attachment->filename),
+            ':filetype' => sanitizeUtf8String($attachment->filetype),
+            ':location' => sanitizeUtf8String($attachment->location),
             ':status_type' => ThreadEmailStatusType::UNKNOWN->value,
             ':status_text' => 'uklassifisert-dok'
         ];
@@ -377,13 +384,13 @@ private function saveEmailProcessingError(
         string $folderName
     ): void {
         ThreadEmailProcessingErrorManager::saveEmailProcessingError(
-            $emailIdentifier,
-            $emailSubject,
-            $emailAddresses,
-            $errorType,
-            $errorMessage,
+            sanitizeUtf8String($emailIdentifier),
+            sanitizeUtf8String($emailSubject),
+            sanitizeUtf8String($emailAddresses),
+            sanitizeUtf8String($errorType),
+            sanitizeUtf8String($errorMessage),
             $suggestedThreadId,
-            $folderName
+            sanitizeUtf8String($folderName)
         );
     }
 }
diff --git a/organizer/src/class/ThreadUtils.php b/organizer/src/class/ThreadUtils.php
index ec412066..4505668e 100644
--- a/organizer/src/class/ThreadUtils.php
+++ b/organizer/src/class/ThreadUtils.php
@@ -286,3 +286,55 @@ function getEmailCcAddressesFromImapHeaders($imapHeaders) {
     
     return extractAddressesFromEmailObjects($ccObjects);
 }
+
+/**
+ * Sanitize UTF-8 string by replacing invalid byte sequences with replacement character
+ * This prevents PostgreSQL UTF-8 encoding errors when inserting data from IMAP
+ * 
+ * @param string $text Text to sanitize
+ * @return string Sanitized UTF-8 text
+ */
+function sanitizeUtf8String(string $text): string {
+    // mb_convert_encoding with 'UTF-8' to 'UTF-8' replaces invalid sequences
+    // The //IGNORE flag would skip invalid sequences, but we want to use the replacement character
+    $sanitized = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
+    
+    // If mb_convert_encoding fails, try iconv with TRANSLIT to replace problematic characters
+    if ($sanitized === false || $sanitized === '') {
+        $sanitized = iconv('UTF-8', 'UTF-8//IGNORE', $text);
+        if ($sanitized === false) {
+            // Last resort: use regex to remove invalid UTF-8 sequences
+            $sanitized = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x80-\x9F]/u', '?', $text);
+            if ($sanitized === null) {
+                // If even regex fails, return empty string
+                return '';
+            }
+        }
+    }
+    
+    return $sanitized;
+}
+
+/**
+ * Recursively sanitize UTF-8 strings in an object or array
+ * This ensures all text values are safe for PostgreSQL UTF-8 encoding (array keys and property names are not sanitized)
+ * 
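+ * @param mixed $data Data to sanitize (object, array, or string); any other type is returned unchanged
+ * @return mixed Sanitized data; arrays are returned as sanitized copies, objects are modified in place and returned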
+ */
+function sanitizeUtf8Recursive($data) {
+    if (is_string($data)) {
+        return sanitizeUtf8String($data);
+    } elseif (is_array($data)) {
+        foreach ($data as $key => $value) {
+            $data[$key] = sanitizeUtf8Recursive($value);
+        }
+        return $data;
+    } elseif (is_object($data)) {
+        foreach ($data as $key => $value) {
+            $data->$key = sanitizeUtf8Recursive($value);
+        }
+        return $data;
+    }
+    return $data;
+}
diff --git a/organizer/src/tests/ThreadUtilsUtf8Test.php b/organizer/src/tests/ThreadUtilsUtf8Test.php
new file mode 100644
index 00000000..9bc18d1c
--- /dev/null
+++ b/organizer/src/tests/ThreadUtilsUtf8Test.php
@@ -0,0 +1,99 @@
+<?php
+
+require_once __DIR__ . '/bootstrap.php';
+require_once __DIR__ . '/../class/ThreadUtils.php';
+
+class ThreadUtilsUtf8Test extends PHPUnit\Framework\TestCase {
+    
+    public function testSanitizeUtf8String_ValidUtf8() {
+        // Valid UTF-8 string should remain unchanged
+        $validUtf8 = "Hello World! Ñoño café";
+        $result = sanitizeUtf8String($validUtf8);
+        $this->assertEquals($validUtf8, $result);
+    }
+    
+    public function testSanitizeUtf8String_InvalidSequence() {
+        // Invalid UTF-8 sequence: 0xC3 followed by space (0x20) instead of continuation byte
+        // This is the exact error from the issue: "invalid byte sequence for encoding "UTF8": 0xc3 0x20"
+        $invalidUtf8 = "Test" . chr(0xC3) . chr(0x20) . "string";
+        $result = sanitizeUtf8String($invalidUtf8);
+        
+        // The result should be a valid UTF-8 string
+        $this->assertNotFalse(mb_check_encoding($result, 'UTF-8'));
+        // The invalid sequence should have been replaced or removed
+        $this->assertNotEquals($invalidUtf8, $result);
+    }
+    
+    public function testSanitizeUtf8String_MultipleInvalidSequences() {
+        // Multiple invalid UTF-8 sequences
+        $invalidUtf8 = chr(0xC3) . chr(0x20) . "test" . chr(0xFF) . chr(0xFE);
+        $result = sanitizeUtf8String($invalidUtf8);
+        
+        // The result should be a valid UTF-8 string
+        $this->assertNotFalse(mb_check_encoding($result, 'UTF-8'));
+    }
+    
+    public function testSanitizeUtf8String_EmptyString() {
+        $result = sanitizeUtf8String("");
+        $this->assertEquals("", $result);
+    }
+    
+    public function testSanitizeUtf8Recursive_Array() {
+        $data = [
+            'valid' => 'Hello',
+            'invalid' => "Test" . chr(0xC3) . chr(0x20) . "string"
+        ];
+        
+        $result = sanitizeUtf8Recursive($data);
+        
+        // Valid string should remain unchanged
+        $this->assertEquals('Hello', $result['valid']);
+        // Invalid string should be sanitized
+        $this->assertNotFalse(mb_check_encoding($result['invalid'], 'UTF-8'));
+        $this->assertNotEquals($data['invalid'], $result['invalid']);
+    }
+    
+    public function testSanitizeUtf8Recursive_Object() {
+        $data = new stdClass();
+        $data->valid = 'Hello';
+        $data->invalid = "Test" . chr(0xC3) . chr(0x20) . "string";
+        $data->nested = new stdClass();
+        $data->nested->value = "Nested" . chr(0xFF);
+        
+        $result = sanitizeUtf8Recursive($data);
+        
+        // Valid string should remain unchanged
+        $this->assertEquals('Hello', $result->valid);
+        // Invalid strings should be sanitized
+        $this->assertNotFalse(mb_check_encoding($result->invalid, 'UTF-8'));
+        $this->assertNotFalse(mb_check_encoding($result->nested->value, 'UTF-8'));
+    }
+    
+    public function testSanitizeUtf8Recursive_MixedTypes() {
+        $data = [
+            'string' => "Test" . chr(0xC3) . chr(0x20),
+            'number' => 42,
+            'boolean' => true,
+            'null' => null,
+            'array' => ['nested' => "Invalid" . chr(0xFF)]
+        ];
+        
+        $result = sanitizeUtf8Recursive($data);
+        
+        // Non-string types should remain unchanged
+        $this->assertEquals(42, $result['number']);
+        $this->assertEquals(true, $result['boolean']);
+        $this->assertEquals(null, $result['null']);
+        // Strings should be sanitized
+        $this->assertNotFalse(mb_check_encoding($result['string'], 'UTF-8'));
+        $this->assertNotFalse(mb_check_encoding($result['array']['nested'], 'UTF-8'));
+    }
+    
+    public function testSanitizeUtf8String_NorwegianCharacters() {
+        // Test with Norwegian characters (should remain valid)
+        $norwegianText = "Snåsa kommune - Innsyn i håndskrevet opptellingsdata";
+        $result = sanitizeUtf8String($norwegianText);
+        $this->assertEquals($norwegianText, $result);
+        $this->assertNotFalse(mb_check_encoding($result, 'UTF-8'));
+    }
+}

From c9d7d26d59e066e689ec0d02b0e2ffc423c26b97 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 23 Jan 2026 14:12:19 +0000
Subject: [PATCH 3/3] Address code review feedback: improve UTF-8 sanitization
 logic

Co-authored-by: HNygard <168380+HNygard@users.noreply.github.com>
---
 organizer/src/class/ThreadUtils.php | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/organizer/src/class/ThreadUtils.php b/organizer/src/class/ThreadUtils.php
index 4505668e..0ed284c8 100644
--- a/organizer/src/class/ThreadUtils.php
+++ b/organizer/src/class/ThreadUtils.php
@@ -295,16 +295,22 @@ function getEmailCcAddressesFromImapHeaders($imapHeaders) {
  * @return string Sanitized UTF-8 text
  */
 function sanitizeUtf8String(string $text): string {
+    // Handle empty strings early
+    if ($text === '') {
+        return '';
+    }
+    
     // mb_convert_encoding with 'UTF-8' to 'UTF-8' replaces invalid sequences
-    // The //IGNORE flag would skip invalid sequences, but we want to use the replacement character
     $sanitized = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
     
-    // If mb_convert_encoding fails, try iconv with TRANSLIT to replace problematic characters
-    if ($sanitized === false || $sanitized === '') {
-        $sanitized = iconv('UTF-8', 'UTF-8//IGNORE', $text);
+    // If mb_convert_encoding fails, fall back to iconv with //IGNORE, which drops invalid byte sequences
+    if ($sanitized === false) {
+        $sanitized = @iconv('UTF-8', 'UTF-8//IGNORE', $text);
         if ($sanitized === false) {
-            // Last resort: use regex to remove invalid UTF-8 sequences
-            $sanitized = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x80-\x9F]/u', '?', $text);
+            // Last resort: strip ASCII control characters (except tab, LF, CR) and DEL
+            // NOTE: bytes >= 0x80 are left untouched, so the result may still contain invalid UTF-8
+            // Don't use 'u' modifier since we're dealing with potentially invalid UTF-8
+            $sanitized = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/', '', $text);
             if ($sanitized === null) {
                 // If even regex fails, return empty string
                 return '';