Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 19 additions & 12 deletions organizer/src/class/ThreadEmailDatabaseSaver.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
require_once __DIR__ . '/ThreadFolderManager.php';
require_once __DIR__ . '/ThreadStorageManager.php';
require_once __DIR__ . '/ThreadEmailProcessingErrorManager.php';
require_once __DIR__ . '/ThreadUtils.php';

use Imap\ImapConnection;
use Imap\ImapEmailProcessor;
Expand Down Expand Up @@ -66,6 +67,9 @@ public function saveThreadEmails(string $folder): array {

# Figure out which thread this email is part of
$all_emails = $email->getEmailAddresses($rawEmail);

// Sanitize email addresses to prevent UTF-8 encoding issues
$all_emails = array_map('sanitizeUtf8String', $all_emails);

$email_identifier = date('Y-m-d__His', $email->timestamp) . '__' . md5($email->subject);

Expand Down Expand Up @@ -254,6 +258,9 @@ private function emailExistsInDatabase(string $threadId, string $emailIdOld): bo
* @return string UUID of the saved email
*/
private function saveEmailToDatabase(string $threadId, object $email, string $direction, string $filename, string $rawEmail, stdClass $imap_headers): string {
// Sanitize IMAP headers to ensure all text fields have valid UTF-8
$sanitized_headers = sanitizeUtf8Recursive($imap_headers);

$query = "
INSERT INTO thread_emails (
thread_id,
Expand All @@ -278,8 +285,8 @@ private function saveEmailToDatabase(string $threadId, object $email, string $di
':email_type' => $direction,
':status_type' => ThreadEmailStatusType::UNKNOWN->value,
':status_text' => 'Uklassifisert',
':imap_headers' => json_encode($imap_headers, JSON_UNESCAPED_UNICODE ^ JSON_UNESCAPED_SLASHES),
':id_old' => $filename
':imap_headers' => json_encode($sanitized_headers, JSON_UNESCAPED_UNICODE ^ JSON_UNESCAPED_SLASHES),
':id_old' => sanitizeUtf8String($filename)
];

// Handle binary content separately
Expand Down Expand Up @@ -321,10 +328,10 @@ public function saveAttachmentToDatabase(string $emailId, object $attachment, $c

$params = [
':email_id' => $emailId,
':name' => $attachment->name,
':filename' => $attachment->filename,
':filetype' => $attachment->filetype,
':location' => $attachment->location,
':name' => sanitizeUtf8String($attachment->name),
':filename' => sanitizeUtf8String($attachment->filename),
':filetype' => sanitizeUtf8String($attachment->filetype),
':location' => sanitizeUtf8String($attachment->location),
':status_type' => ThreadEmailStatusType::UNKNOWN->value,
':status_text' => 'uklassifisert-dok'
];
Expand Down Expand Up @@ -377,13 +384,13 @@ private function saveEmailProcessingError(
string $folderName
): void {
ThreadEmailProcessingErrorManager::saveEmailProcessingError(
$emailIdentifier,
$emailSubject,
$emailAddresses,
$errorType,
$errorMessage,
sanitizeUtf8String($emailIdentifier),
sanitizeUtf8String($emailSubject),
sanitizeUtf8String($emailAddresses),
sanitizeUtf8String($errorType),
sanitizeUtf8String($errorMessage),
$suggestedThreadId,
$folderName
sanitizeUtf8String($folderName)
);
}
}
58 changes: 58 additions & 0 deletions organizer/src/class/ThreadUtils.php
Original file line number Diff line number Diff line change
Expand Up @@ -286,3 +286,61 @@ function getEmailCcAddressesFromImapHeaders($imapHeaders) {

return extractAddressesFromEmailObjects($ccObjects);
}

/**
 * Return a copy of $text that is guaranteed to be valid UTF-8.
 *
 * Valid input (including the empty string) is returned unchanged. Otherwise
 * the function tries, in order, and returns the first result that validates:
 *   1. mb_convert_encoding() UTF-8 -> UTF-8, which replaces each invalid
 *      sequence with the mbstring substitute character ('?' by default);
 *   2. iconv() with //IGNORE, which drops the invalid bytes;
 *   3. removal of every byte outside printable ASCII / tab / LF / CR.
 *
 * This prevents PostgreSQL UTF-8 encoding errors when inserting data from IMAP.
 *
 * NOTE(review): NUL bytes (0x00) are valid UTF-8 and therefore pass through
 * unchanged; PostgreSQL text columns are believed to reject them - confirm
 * whether callers need them stripped as well.
 *
 * @param string $text Text to sanitize
 * @return string Sanitized text; always passes mb_check_encoding(..., 'UTF-8')
 */
function sanitizeUtf8String(string $text): string {
    // Fast path: nothing to repair, so hand back the input byte-for-byte.
    if ($text === '' || mb_check_encoding($text, 'UTF-8')) {
        return $text;
    }

    // Converting UTF-8 to UTF-8 substitutes invalid sequences.
    $sanitized = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
    if (is_string($sanitized) && mb_check_encoding($sanitized, 'UTF-8')) {
        return $sanitized;
    }

    // iconv with //IGNORE discards what it cannot convert. It raises a notice
    // and may return false on illegal input, hence the suppression plus the
    // explicit check of the result.
    $sanitized = @iconv('UTF-8', 'UTF-8//IGNORE', $text);
    if (is_string($sanitized) && mb_check_encoding($sanitized, 'UTF-8')) {
        return $sanitized;
    }

    // Last resort: drop control characters AND every byte >= 0x7F. Pure ASCII
    // is always valid UTF-8, so this cannot return an invalid sequence.
    // No 'u' modifier: the subject is known to be invalid UTF-8 at this point.
    $sanitized = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\xFF]/', '', $text);

    // preg_replace returns null on failure; fall back to an empty string.
    return $sanitized ?? '';
}

/**
 * Recursively sanitize every string inside a value so it is valid UTF-8.
 *
 * - Strings are passed through sanitizeUtf8String().
 * - Arrays are walked element by element; keys are left untouched.
 * - Objects are cloned and the properties reachable via foreach are
 *   sanitized on the clone, so the object passed in by the caller is
 *   never modified. Property names are left untouched.
 * - Any other type (int, float, bool, null, ...) is returned as is.
 *
 * The function recurses without cycle detection, so self-referencing
 * object graphs must not be passed in.
 *
 * @param mixed $data Data to sanitize (object, array, string or scalar)
 * @return mixed Sanitized copy of the data
 */
function sanitizeUtf8Recursive($data) {
    if (is_string($data)) {
        return sanitizeUtf8String($data);
    }

    if (is_array($data)) {
        // Arrays are passed by value, so writing into $data does not affect
        // the caller's array.
        foreach ($data as $key => $value) {
            $data[$key] = sanitizeUtf8Recursive($value);
        }
        return $data;
    }

    if (is_object($data)) {
        // Objects are handles: writing to $data directly would rewrite the
        // caller's instance. Work on a shallow clone instead; nested objects
        // are cloned in turn by the recursive call.
        $copy = clone $data;
        foreach ($data as $key => $value) {
            $copy->$key = sanitizeUtf8Recursive($value);
        }
        return $copy;
    }

    return $data;
}
99 changes: 99 additions & 0 deletions organizer/src/tests/ThreadUtilsUtf8Test.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
<?php

require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/../class/ThreadUtils.php';

/**
 * Tests for the UTF-8 sanitizing helpers sanitizeUtf8String() and
 * sanitizeUtf8Recursive().
 *
 * Identity assertions (assertSame / assertTrue) are used throughout so that a
 * sanitizer which silently changes a value's type (e.g. null -> '') fails.
 */
class ThreadUtilsUtf8Test extends PHPUnit\Framework\TestCase {

    public function testSanitizeUtf8String_ValidUtf8(): void {
        // Valid UTF-8 string should remain unchanged
        $validUtf8 = "Hello World! Ñoño café";
        $result = sanitizeUtf8String($validUtf8);
        $this->assertSame($validUtf8, $result);
    }

    public function testSanitizeUtf8String_InvalidSequence(): void {
        // Invalid UTF-8 sequence: 0xC3 followed by space (0x20) instead of continuation byte
        // This is the exact error from the issue: "invalid byte sequence for encoding "UTF8": 0xc3 0x20"
        $invalidUtf8 = "Test" . chr(0xC3) . chr(0x20) . "string";
        $result = sanitizeUtf8String($invalidUtf8);

        // The result should be a valid UTF-8 string
        $this->assertTrue(mb_check_encoding($result, 'UTF-8'));
        // The invalid sequence should have been replaced or removed
        $this->assertNotSame($invalidUtf8, $result);
    }

    public function testSanitizeUtf8String_MultipleInvalidSequences(): void {
        // Multiple invalid UTF-8 sequences
        $invalidUtf8 = chr(0xC3) . chr(0x20) . "test" . chr(0xFF) . chr(0xFE);
        $result = sanitizeUtf8String($invalidUtf8);

        // The result should be a valid UTF-8 string
        $this->assertTrue(mb_check_encoding($result, 'UTF-8'));
    }

    public function testSanitizeUtf8String_EmptyString(): void {
        $result = sanitizeUtf8String("");
        $this->assertSame("", $result);
    }

    public function testSanitizeUtf8Recursive_Array(): void {
        $data = [
            'valid' => 'Hello',
            'invalid' => "Test" . chr(0xC3) . chr(0x20) . "string"
        ];

        $result = sanitizeUtf8Recursive($data);

        // Valid string should remain unchanged
        $this->assertSame('Hello', $result['valid']);
        // Invalid string should be sanitized
        $this->assertTrue(mb_check_encoding($result['invalid'], 'UTF-8'));
        $this->assertNotSame($data['invalid'], $result['invalid']);
    }

    public function testSanitizeUtf8Recursive_Object(): void {
        $data = new stdClass();
        $data->valid = 'Hello';
        $data->invalid = "Test" . chr(0xC3) . chr(0x20) . "string";
        $data->nested = new stdClass();
        $data->nested->value = "Nested" . chr(0xFF);

        $result = sanitizeUtf8Recursive($data);

        // Valid string should remain unchanged
        $this->assertSame('Hello', $result->valid);
        // Invalid strings should be sanitized
        $this->assertTrue(mb_check_encoding($result->invalid, 'UTF-8'));
        $this->assertTrue(mb_check_encoding($result->nested->value, 'UTF-8'));
    }

    public function testSanitizeUtf8Recursive_MixedTypes(): void {
        $data = [
            'string' => "Test" . chr(0xC3) . chr(0x20),
            'number' => 42,
            'boolean' => true,
            'null' => null,
            'array' => ['nested' => "Invalid" . chr(0xFF)]
        ];

        $result = sanitizeUtf8Recursive($data);

        // Non-string types should remain unchanged, including their type
        $this->assertSame(42, $result['number']);
        $this->assertTrue($result['boolean']);
        $this->assertNull($result['null']);
        // Strings should be sanitized
        $this->assertTrue(mb_check_encoding($result['string'], 'UTF-8'));
        $this->assertTrue(mb_check_encoding($result['array']['nested'], 'UTF-8'));
    }

    public function testSanitizeUtf8String_NorwegianCharacters(): void {
        // Test with Norwegian characters (should remain valid)
        $norwegianText = "Snåsa kommune - Innsyn i håndskrevet opptellingsdata";
        $result = sanitizeUtf8String($norwegianText);
        $this->assertSame($norwegianText, $result);
        $this->assertTrue(mb_check_encoding($result, 'UTF-8'));
    }
}