From f545dcc2d9f69618a05441465b1f453c207382da Mon Sep 17 00:00:00 2001
From: Christian Koop <contact@sprax2013.de>
Date: Tue, 6 Apr 2021 14:22:52 +0200
Subject: [PATCH] Fix charset not being detected correctly on UTF-8 files
 (SD-7843)

With this patch, the tested bytes always end on a whole word/line. This ensures that multi-byte characters (like in UTF-8) which are cut off at the end of the buffer are no longer detected as malformed input, which caused `ISO-8859-1` to be detected instead, because it just doesn't care about anything and replaces everything it doesn't know with something it does... Why no error? Would like to know that too :/

This *should* not be able to break existing files or plugins.
---
 .../com/songoda/core/utils/TextUtils.java     | 95 +++++++++++--------
 1 file changed, 55 insertions(+), 40 deletions(-)
diff --git a/Core/src/main/java/com/songoda/core/utils/TextUtils.java b/Core/src/main/java/com/songoda/core/utils/TextUtils.java
index 50e3032d..71252554 100644
--- a/Core/src/main/java/com/songoda/core/utils/TextUtils.java
+++ b/Core/src/main/java/com/songoda/core/utils/TextUtils.java
@@ -6,9 +6,10 @@ import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
+import java.nio.CharBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -16,6 +17,23 @@ import java.util.List;
 import java.util.stream.Collectors;
 
 public class TextUtils {
+    private static final List<Charset> supportedCharsets = new ArrayList<>(); // candidates, tried in insertion order by detectCharset(byte[], int, Charset)
+
+    static {
+        supportedCharsets.add(StandardCharsets.UTF_8); // UTF-8 BOM: EF BB BF
+        supportedCharsets.add(StandardCharsets.ISO_8859_1); // has no BOM of its own; NOTE(review): it appears to accept every byte sequence, so the entries below look unreachable - confirm
+//        supportedCharsets.add(StandardCharsets.UTF_16LE); // FF FE
+//        supportedCharsets.add(StandardCharsets.UTF_16BE); // FE FF
+//        supportedCharsets.add(StandardCharsets.UTF_16);
+
+        try {
+            supportedCharsets.add(Charset.forName("windows-1253"));
+            supportedCharsets.add(Charset.forName("ISO-8859-7"));
+        } catch (Exception ignore) {    // UnsupportedCharsetException technically can be thrown, but can also be ignored (a failure on the first lookup also skips the second)
+        }
+
+        supportedCharsets.add(StandardCharsets.US_ASCII);
+    }
 
     public static String formatText(String text) {
         return formatText(text, false);
@@ -110,81 +128,78 @@ public class TextUtils {
         return s.replaceAll(ChatColor.COLOR_CHAR + ";" + ChatColor.COLOR_CHAR + "|" + ChatColor.COLOR_CHAR, "");
     }
 
-    protected static final List<Charset> supportedCharsets = new ArrayList<>();
-
-    static {
-        supportedCharsets.add(StandardCharsets.UTF_8); // UTF-8 BOM: EF BB BF
-        supportedCharsets.add(StandardCharsets.ISO_8859_1); // also starts with EF BB BF
-        //supportedCharsets.add(StandardCharsets.UTF_16LE); // FF FE
-        //supportedCharsets.add(StandardCharsets.UTF_16BE); // FE FF
-        //supportedCharsets.add(StandardCharsets.UTF_16);
-        try {
-            supportedCharsets.add(Charset.forName("windows-1253"));
-            supportedCharsets.add(Charset.forName("ISO-8859-7"));
-        } catch (Exception ignore) {
-        } // UnsupportedCharsetException technically can be thrown, but can also be ignored
-        supportedCharsets.add(StandardCharsets.US_ASCII);
-    }
-
     public static Charset detectCharset(File f, Charset def) {
         byte[] buffer = new byte[2048];
-        int read;
+        int len; // number of bytes actually read, or -1 if nothing could be read (end of file)
 
-        // read the first 2kb of the file and test the file's encoding
+        // Read up to the first 2KiB of the file and test the file's encoding (any exception while reading yields null, not def)
         try (FileInputStream input = new FileInputStream(f)) {
-            read = input.read(buffer);
+            len = input.read(buffer);
         } catch (Exception ex) {
             return null;
         }
 
-        return read != -1 ? detectCharset(buffer, read, def) : def;
+        return len != -1 ? detectCharset(buffer, len, def) : def; // nothing read => fall back to def
     }
 
     public static Charset detectCharset(BufferedInputStream reader, Charset def) {
         byte[] buffer = new byte[2048];
-        int read;
+        int len; // number of bytes actually read, or -1 if the stream is already at its end
+
+        // Peek at up to the first 2KiB of the stream and test its encoding; mark/reset puts the stream back where it was for the caller
         try {
             reader.mark(2048);
-            read = reader.read(buffer);
+            len = reader.read(buffer);
             reader.reset();
         } catch (Exception ex) {
             return null;
         }
-        return read != -1 ? detectCharset(buffer, read, def) : def;
+
+        return len != -1 ? detectCharset(buffer, len, def) : def; // nothing read => fall back to def
     }
 
     public static Charset detectCharset(byte[] data, int len, Charset def) {
         // check the file header
         if (len > 4) {
-            if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE) {
+            if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE) { // FF FE 00 00 is UTF-32LE
                 return StandardCharsets.UTF_16LE;
-                // FF FE 00 00 is UTF-32LE
-            } else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF) {
+            } else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF) {  // 00 00 FE FF is UTF-32BE
                 return StandardCharsets.UTF_16BE;
-                // 00 00 FE FF is UTF-32BE
             } else if (data[0] == (byte) 0xEF && data[1] == (byte) 0xBB && data[2] == (byte) 0xBF) { // UTF-8 with BOM, same sig as ISO-8859-1
                 return StandardCharsets.UTF_8;
             }
         }
 
-        // iterate through sets to test, and return the first that is ok
+        // Cut the tested range after the last whitespace byte, so a word/multi-byte character split by the end of the buffer is ignored
+        int newLen = len;
+        for (; newLen > 0; --newLen) {
+            if (Character.isWhitespace(data[newLen - 1])) break;
+        }
+
+        // No whitespace at all (would test an empty range, which every charset accepts) or buffer got too small? => checking whole buffer
+        if (newLen == 0 || (len > 512 && newLen < 512)) {
+            newLen = len;
+        }
+
+        ByteBuffer bBuff = ByteBuffer.wrap(data, 0, newLen).asReadOnlyBuffer();
+
+        // Check through a list of charsets and return the first one that could decode the buffer; def if none could
         for (Charset charset : supportedCharsets) {
-            if (charset != null && isCharset(data, len, charset)) {
+            if (charset != null && isCharset(bBuff, charset)) {
                 return charset;
             }
+
+            bBuff.rewind(); // isCharset consumed the buffer; start from the beginning for the next charset
         }
 
         return def;
     }
 
-    public static boolean isCharset(byte[] data, int len, Charset charset) {
-        try {
-            CharsetDecoder decoder = charset.newDecoder();
-            decoder.reset();
-            decoder.decode(ByteBuffer.wrap(data));
-            return true;
-        } catch (CharacterCodingException e) {
-        }
-        return false;
+    public static boolean isCharset(ByteBuffer data, Charset charset) { // NOTE: decoding advances data's position; rewind before testing another charset
+        CharsetDecoder decoder = charset.newDecoder();
+        decoder.onMalformedInput(CodingErrorAction.REPORT); // report bad input as an error result instead of replacing it
+        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+        // endOfInput = true: no more bytes will follow; an underflow result means the input was consumed without a reported error
+        return decoder.decode(data, CharBuffer.allocate(data.capacity()), true).isUnderflow();
     }
-}
+}
\ No newline at end of file