From f399fed405b507c9e1aa7c33eb7a12fca43437a5 Mon Sep 17 00:00:00 2001 From: Filip Hejsek Date: Tue, 3 Mar 2026 19:31:18 +0100 Subject: [PATCH] Detect valid UTF-8 as UTF-8 --- libaegisub/common/charset.cpp | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/libaegisub/common/charset.cpp b/libaegisub/common/charset.cpp index 2d074870ae..4156254c1f 100644 --- a/libaegisub/common/charset.cpp +++ b/libaegisub/common/charset.cpp @@ -15,10 +15,15 @@ #include "libaegisub/charset.h" #include "libaegisub/file_mapping.h" +#include "libaegisub/log.h" #include "libaegisub/scoped_ptr.h" +#include +#include + #ifdef WITH_UCHARDET #include +#include #endif namespace agi::charset { @@ -51,6 +56,15 @@ std::string Detect(agi::fs::path const& file) { #ifdef WITH_UCHARDET agi::scoped_holder ud(uchardet_new(), uchardet_delete); + + UErrorCode utf8Status = U_ZERO_ERROR; + std::unique_ptr conv = {ucnv_open("UTF-8", &utf8Status), ucnv_close}; + if (conv) + ucnv_setToUCallBack(conv.get(), UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &utf8Status); + if (utf8Status != U_ZERO_ERROR) + LOG_W("charset/detect") << "Unexpected ICU error: " << u_errorName(utf8Status); + std::array convBuffer; + for (uint64_t offset = 0; offset < fp.size(); ) { auto read = std::min(4096, fp.size() - offset); auto buf = fp.read(offset, read); @@ -58,6 +72,21 @@ std::string Detect(agi::fs::path const& file) { offset += read; + const char *source = buf; + const char *sourceLimit = source + read; + bool flush = offset >= fp.size(); + while (U_SUCCESS(utf8Status)) { + UChar *target = convBuffer.data(); + UChar *targetLimit = target + convBuffer.size(); + ucnv_toUnicode(conv.get(), &target, targetLimit, &source, sourceLimit, nullptr, flush, &utf8Status); + if (utf8Status == U_BUFFER_OVERFLOW_ERROR) { + // result didn't fit in target buffer, try again + utf8Status = U_ZERO_ERROR; + } else if (source == sourceLimit) { + break; + } + } + // A dumb heuristic to detect binary files for (size_t i = 0; i < read; ++i) { if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t')) @@ -67,6 +96,9 @@ std::string Detect(agi::fs::path const& file) { if (binaryish > offset / 8) return "binary"; } + LOG_D("charset/detect") << "UTF-8 detection result: " << u_errorName(utf8Status); + if (U_SUCCESS(utf8Status)) + return "utf-8"; uchardet_data_end(ud); return uchardet_get_charset(ud); #else