From f545dcc2d9f69618a05441465b1f453c207382da Mon Sep 17 00:00:00 2001 From: Christian Koop Date: Tue, 6 Apr 2021 14:22:52 +0200 Subject: [PATCH] Fix charset not being detected correctly on UTF-8 files (SD-7843) With this patch, the tested bytes are whole words/lines. This ensures that multi-byte characters (like in UTF-8) are not detected as malformed input and `ISO-8859-1` being detected because it just doesn't care about anything and replaces everything it doesn't know with something it does... Why no error? Would like to know that too :/ This *should* not be able to break existig files or plugins. --- .../com/songoda/core/utils/TextUtils.java | 95 +++++++++++-------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/Core/src/main/java/com/songoda/core/utils/TextUtils.java b/Core/src/main/java/com/songoda/core/utils/TextUtils.java index 50e3032d..71252554 100644 --- a/Core/src/main/java/com/songoda/core/utils/TextUtils.java +++ b/Core/src/main/java/com/songoda/core/utils/TextUtils.java @@ -6,9 +6,10 @@ import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.nio.ByteBuffer; -import java.nio.charset.CharacterCodingException; +import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; @@ -16,6 +17,23 @@ import java.util.List; import java.util.stream.Collectors; public class TextUtils { + private static final List supportedCharsets = new ArrayList<>(); + + static { + supportedCharsets.add(StandardCharsets.UTF_8); // UTF-8 BOM: EF BB BF + supportedCharsets.add(StandardCharsets.ISO_8859_1); // also starts with EF BB BF +// supportedCharsets.add(StandardCharsets.UTF_16LE); // FF FE +// supportedCharsets.add(StandardCharsets.UTF_16BE); // FE FF +// supportedCharsets.add(StandardCharsets.UTF_16); + + try { + supportedCharsets.add(Charset.forName("windows-1253")); + supportedCharsets.add(Charset.forName("ISO-8859-7")); + } catch (Exception ignore) { // UnsupportedCharsetException technically can be thrown, but can also be ignored + } + + supportedCharsets.add(StandardCharsets.US_ASCII); + } public static String formatText(String text) { return formatText(text, false); @@ -110,81 +128,78 @@ public class TextUtils { return s.replaceAll(ChatColor.COLOR_CHAR + ";" + ChatColor.COLOR_CHAR + "|" + ChatColor.COLOR_CHAR, ""); } - protected static final List supportedCharsets = new ArrayList<>(); - - static { - supportedCharsets.add(StandardCharsets.UTF_8); // UTF-8 BOM: EF BB BF - supportedCharsets.add(StandardCharsets.ISO_8859_1); // also starts with EF BB BF - //supportedCharsets.add(StandardCharsets.UTF_16LE); // FF FE - //supportedCharsets.add(StandardCharsets.UTF_16BE); // FE FF - //supportedCharsets.add(StandardCharsets.UTF_16); - try { - supportedCharsets.add(Charset.forName("windows-1253")); - supportedCharsets.add(Charset.forName("ISO-8859-7")); - } catch (Exception ignore) { - } // UnsupportedCharsetException technically can be thrown, but can also be ignored - supportedCharsets.add(StandardCharsets.US_ASCII); - } - public static Charset detectCharset(File f, Charset def) { byte[] buffer = new byte[2048]; - int read; + int len; - // read the first 2kb of the file and test the file's encoding + // Read the first 2KiB of the file and test the file's encoding try (FileInputStream input = new FileInputStream(f)) { - read = input.read(buffer); + len = input.read(buffer); } catch (Exception ex) { return null; } - return read != -1 ? detectCharset(buffer, read, def) : def; + return len != -1 ? detectCharset(buffer, len, def) : def; } public static Charset detectCharset(BufferedInputStream reader, Charset def) { byte[] buffer = new byte[2048]; - int read; + int len; + + // Read the first 2KiB of the file and test the file's encoding try { reader.mark(2048); - read = reader.read(buffer); + len = reader.read(buffer); reader.reset(); } catch (Exception ex) { return null; } - return read != -1 ? detectCharset(buffer, read, def) : def; + + return len != -1 ? detectCharset(buffer, len, def) : def; } public static Charset detectCharset(byte[] data, int len, Charset def) { // check the file header if (len > 4) { - if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE) { + if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE) { // FF FE 00 00 is UTF-32LE return StandardCharsets.UTF_16LE; - // FF FE 00 00 is UTF-32LE - } else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF) { + } else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF) { // 00 00 FE FF is UTF-32BE return StandardCharsets.UTF_16BE; - // 00 00 FE FF is UTF-32BE } else if (data[0] == (byte) 0xEF && data[1] == (byte) 0xBB && data[2] == (byte) 0xBF) { // UTF-8 with BOM, same sig as ISO-8859-1 return StandardCharsets.UTF_8; } } - // iterate through sets to test, and return the first that is ok + // Look for last Whitespace Character and ignore potentially broken words/multi-byte characters + int newLen = len; + for (; newLen > 0; --newLen) { + if (Character.isWhitespace(data[newLen - 1])) break; + } + + // Buffer got too small? => checking whole buffer + if (len > 512 && newLen < 512) { + newLen = len; + } + + ByteBuffer bBuff = ByteBuffer.wrap(data, 0, newLen).asReadOnlyBuffer(); + + // Check through a list of charsets and return the first one that could decode the buffer for (Charset charset : supportedCharsets) { - if (charset != null && isCharset(data, len, charset)) { + if (charset != null && isCharset(bBuff, charset)) { return charset; } + + bBuff.rewind(); } return def; } - public static boolean isCharset(byte[] data, int len, Charset charset) { - try { - CharsetDecoder decoder = charset.newDecoder(); - decoder.reset(); - decoder.decode(ByteBuffer.wrap(data)); - return true; - } catch (CharacterCodingException e) { - } - return false; + public static boolean isCharset(ByteBuffer data, Charset charset) { + CharsetDecoder decoder = charset.newDecoder(); + decoder.onMalformedInput(CodingErrorAction.REPORT); + decoder.onUnmappableCharacter(CodingErrorAction.REPORT); + + return decoder.decode(data, CharBuffer.allocate(data.capacity()), true).isUnderflow(); } -} +} \ No newline at end of file