Fix charset not being detected correctly on UTF-8 files (SD-7843)

With this patch, the tested bytes always end on a whole word/line. This ensures that multi-byte characters (like in UTF-8) are no longer cut off mid-sequence and flagged as malformed input, which previously caused `ISO-8859-1` to be detected instead: that charset accepts every byte value and simply replaces anything it doesn't know with something it does, so it never reports a decoding error.

This *should* not be able to break existing files or plugins.
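(Illustration only, not part of the commit; the class and method names and the sample bytes are made up.) A strict decoder rejects a buffer that was cut inside a UTF-8 sequence, while ISO-8859-1 decodes the same bytes without complaint, which is exactly the misdetection described above:

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class TruncatedUtf8Demo {

    public static void main(String[] args) {
        // "ä" is encoded as C3 A4 in UTF-8; dropping the last byte simulates a word
        // that got split at the 2 KiB read boundary
        byte[] full = "Wortä".getBytes(StandardCharsets.UTF_8);
        byte[] truncated = Arrays.copyOf(full, full.length - 1);

        System.out.println(decodesCleanly(truncated, StandardCharsets.UTF_8));      // false -> malformed input
        System.out.println(decodesCleanly(truncated, StandardCharsets.ISO_8859_1)); // true  -> never complains
    }

    private static boolean decodesCleanly(byte[] data, Charset charset) {
        CharsetDecoder decoder = charset.newDecoder();
        decoder.onMalformedInput(CodingErrorAction.REPORT);
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);

        return decoder.decode(ByteBuffer.wrap(data), CharBuffer.allocate(data.length), true).isUnderflow();
    }
}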
Christian Koop 2021-04-06 14:22:52 +02:00
parent 21f8487988
commit f545dcc2d9
No known key found for this signature in database
GPG Key ID: 89A8181384E010A3

@@ -6,9 +6,10 @@ import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
+import java.nio.CharBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -16,6 +17,23 @@ import java.util.List;
 import java.util.stream.Collectors;
 
 public class TextUtils {
+    private static final List<Charset> supportedCharsets = new ArrayList<>();
+
+    static {
+        supportedCharsets.add(StandardCharsets.UTF_8); // UTF-8 BOM: EF BB BF
+        supportedCharsets.add(StandardCharsets.ISO_8859_1); // also starts with EF BB BF
+        // supportedCharsets.add(StandardCharsets.UTF_16LE); // FF FE
+        // supportedCharsets.add(StandardCharsets.UTF_16BE); // FE FF
+        // supportedCharsets.add(StandardCharsets.UTF_16);
+
+        try {
+            supportedCharsets.add(Charset.forName("windows-1253"));
+            supportedCharsets.add(Charset.forName("ISO-8859-7"));
+        } catch (Exception ignore) { // UnsupportedCharsetException technically can be thrown, but can also be ignored
+        }
+
+        supportedCharsets.add(StandardCharsets.US_ASCII);
+    }
 
     public static String formatText(String text) {
         return formatText(text, false);
@@ -110,81 +128,78 @@ public class TextUtils {
         return s.replaceAll(ChatColor.COLOR_CHAR + ";" + ChatColor.COLOR_CHAR + "|" + ChatColor.COLOR_CHAR, "");
     }
 
-    protected static final List<Charset> supportedCharsets = new ArrayList<>();
-    static {
-        supportedCharsets.add(StandardCharsets.UTF_8); // UTF-8 BOM: EF BB BF
-        supportedCharsets.add(StandardCharsets.ISO_8859_1); // also starts with EF BB BF
-        //supportedCharsets.add(StandardCharsets.UTF_16LE); // FF FE
-        //supportedCharsets.add(StandardCharsets.UTF_16BE); // FE FF
-        //supportedCharsets.add(StandardCharsets.UTF_16);
-        try {
-            supportedCharsets.add(Charset.forName("windows-1253"));
-            supportedCharsets.add(Charset.forName("ISO-8859-7"));
-        } catch (Exception ignore) {
-        } // UnsupportedCharsetException technically can be thrown, but can also be ignored
-        supportedCharsets.add(StandardCharsets.US_ASCII);
-    }
 
     public static Charset detectCharset(File f, Charset def) {
         byte[] buffer = new byte[2048];
-        int read;
+        int len;
 
-        // read the first 2kb of the file and test the file's encoding
+        // Read the first 2KiB of the file and test the file's encoding
         try (FileInputStream input = new FileInputStream(f)) {
-            read = input.read(buffer);
+            len = input.read(buffer);
         } catch (Exception ex) {
             return null;
         }
 
-        return read != -1 ? detectCharset(buffer, read, def) : def;
+        return len != -1 ? detectCharset(buffer, len, def) : def;
     }
 
     public static Charset detectCharset(BufferedInputStream reader, Charset def) {
         byte[] buffer = new byte[2048];
-        int read;
+        int len;
 
+        // Read the first 2KiB of the file and test the file's encoding
         try {
             reader.mark(2048);
-            read = reader.read(buffer);
+            len = reader.read(buffer);
             reader.reset();
         } catch (Exception ex) {
             return null;
         }
 
-        return read != -1 ? detectCharset(buffer, read, def) : def;
+        return len != -1 ? detectCharset(buffer, len, def) : def;
     }
 
     public static Charset detectCharset(byte[] data, int len, Charset def) {
         // check the file header
         if (len > 4) {
-            if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE) {
+            if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE) { // FF FE 00 00 is UTF-32LE
                 return StandardCharsets.UTF_16LE;
-                // FF FE 00 00 is UTF-32LE
-            } else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF) {
+            } else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF) { // 00 00 FE FF is UTF-32BE
                 return StandardCharsets.UTF_16BE;
-                // 00 00 FE FF is UTF-32BE
             } else if (data[0] == (byte) 0xEF && data[1] == (byte) 0xBB && data[2] == (byte) 0xBF) { // UTF-8 with BOM, same sig as ISO-8859-1
                 return StandardCharsets.UTF_8;
             }
         }
 
-        // iterate through sets to test, and return the first that is ok
+        // Look for last Whitespace Character and ignore potentially broken words/multi-byte characters
+        int newLen = len;
+        for (; newLen > 0; --newLen) {
+            if (Character.isWhitespace(data[newLen - 1])) break;
+        }
+
+        // Buffer got too small? => checking whole buffer
+        if (len > 512 && newLen < 512) {
+            newLen = len;
+        }
+
+        ByteBuffer bBuff = ByteBuffer.wrap(data, 0, newLen).asReadOnlyBuffer();
+
+        // Check through a list of charsets and return the first one that could decode the buffer
         for (Charset charset : supportedCharsets) {
-            if (charset != null && isCharset(data, len, charset)) {
+            if (charset != null && isCharset(bBuff, charset)) {
                 return charset;
             }
+
+            bBuff.rewind();
         }
 
         return def;
     }
 
-    public static boolean isCharset(byte[] data, int len, Charset charset) {
-        try {
+    public static boolean isCharset(ByteBuffer data, Charset charset) {
         CharsetDecoder decoder = charset.newDecoder();
-            decoder.reset();
-            decoder.decode(ByteBuffer.wrap(data));
-            return true;
-        } catch (CharacterCodingException e) {
-        }
-        return false;
+        decoder.onMalformedInput(CodingErrorAction.REPORT);
+        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+
+        return decoder.decode(data, CharBuffer.allocate(data.capacity()), true).isUnderflow();
     }
 }
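For context, a rough usage sketch of the changed API (not part of the commit; the config.yml path, the wrapper class, and the UTF-8 fallback are made up, and TextUtils is assumed to be importable). The File overload probes the first 2 KiB and falls back to the given default charset, but returns null when the file cannot be read at all, so that case needs handling:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class DetectCharsetUsage {

    public static void main(String[] args) throws IOException {
        File file = new File("config.yml"); // hypothetical input file

        // Probe the first 2 KiB; UTF-8 is only the fallback for inconclusive content
        Charset charset = TextUtils.detectCharset(file, StandardCharsets.UTF_8);
        if (charset == null) {
            charset = StandardCharsets.UTF_8; // detectCharset returns null when the file can't be read
        }

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), charset))) {
            reader.lines().forEach(System.out::println);
        }
    }
}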