Fix charset not being detected correctly on UTF-8 files (SD-7843)

With this patch, the tested bytes always end on a whole word/line. This ensures that multi-byte characters (like in UTF-8) are no longer cut off mid-sequence and flagged as malformed input, which previously caused `ISO-8859-1` to be detected instead: that charset accepts every byte value and simply replaces anything it doesn't know with something it does, so it never reports a decoding error.

This *should* not be able to break existing files or plugins.
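(Illustration only, not part of the commit; the class and method names and the sample bytes are made up.) A strict decoder rejects a buffer that was cut inside a UTF-8 sequence, while ISO-8859-1 decodes the same bytes without complaint, which is exactly the misdetection described above:

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class TruncatedUtf8Demo {

    public static void main(String[] args) {
        // "ä" is encoded as C3 A4 in UTF-8; dropping the last byte simulates a word
        // that got split at the 2 KiB read boundary
        byte[] full = "Wortä".getBytes(StandardCharsets.UTF_8);
        byte[] truncated = Arrays.copyOf(full, full.length - 1);

        System.out.println(decodesCleanly(truncated, StandardCharsets.UTF_8));      // false -> malformed input
        System.out.println(decodesCleanly(truncated, StandardCharsets.ISO_8859_1)); // true  -> never complains
    }

    private static boolean decodesCleanly(byte[] data, Charset charset) {
        CharsetDecoder decoder = charset.newDecoder();
        decoder.onMalformedInput(CodingErrorAction.REPORT);
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);

        return decoder.decode(ByteBuffer.wrap(data), CharBuffer.allocate(data.length), true).isUnderflow();
    }
}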
Christian Koop 2021-04-06 14:22:52 +02:00
parent 21f8487988
commit f545dcc2d9
No known key found for this signature in database
GPG Key ID: 89A8181384E010A3

@@ -6,9 +6,10 @@ import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
+import java.nio.CharBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -16,6 +17,23 @@ import java.util.List;
 import java.util.stream.Collectors;
 
 public class TextUtils {
+    private static final List<Charset> supportedCharsets = new ArrayList<>();
+
+    static {
+        supportedCharsets.add(StandardCharsets.UTF_8); // UTF-8 BOM: EF BB BF
+        supportedCharsets.add(StandardCharsets.ISO_8859_1); // also starts with EF BB BF
+        // supportedCharsets.add(StandardCharsets.UTF_16LE); // FF FE
+        // supportedCharsets.add(StandardCharsets.UTF_16BE); // FE FF
+        // supportedCharsets.add(StandardCharsets.UTF_16);
+
+        try {
+            supportedCharsets.add(Charset.forName("windows-1253"));
+            supportedCharsets.add(Charset.forName("ISO-8859-7"));
+        } catch (Exception ignore) { // UnsupportedCharsetException technically can be thrown, but can also be ignored
+        }
+
+        supportedCharsets.add(StandardCharsets.US_ASCII);
+    }
 
     public static String formatText(String text) {
         return formatText(text, false);
@@ -110,81 +128,78 @@ public class TextUtils {
         return s.replaceAll(ChatColor.COLOR_CHAR + ";" + ChatColor.COLOR_CHAR + "|" + ChatColor.COLOR_CHAR, "");
     }
 
-    protected static final List<Charset> supportedCharsets = new ArrayList<>();
-    static {
-        supportedCharsets.add(StandardCharsets.UTF_8); // UTF-8 BOM: EF BB BF
-        supportedCharsets.add(StandardCharsets.ISO_8859_1); // also starts with EF BB BF
-        //supportedCharsets.add(StandardCharsets.UTF_16LE); // FF FE
-        //supportedCharsets.add(StandardCharsets.UTF_16BE); // FE FF
-        //supportedCharsets.add(StandardCharsets.UTF_16);
-        try {
-            supportedCharsets.add(Charset.forName("windows-1253"));
-            supportedCharsets.add(Charset.forName("ISO-8859-7"));
-        } catch (Exception ignore) {
-        } // UnsupportedCharsetException technically can be thrown, but can also be ignored
-        supportedCharsets.add(StandardCharsets.US_ASCII);
-    }
 
     public static Charset detectCharset(File f, Charset def) {
         byte[] buffer = new byte[2048];
-        int read;
+        int len;
 
-        // read the first 2kb of the file and test the file's encoding
+        // Read the first 2KiB of the file and test the file's encoding
         try (FileInputStream input = new FileInputStream(f)) {
-            read = input.read(buffer);
+            len = input.read(buffer);
         } catch (Exception ex) {
             return null;
         }
 
-        return read != -1 ? detectCharset(buffer, read, def) : def;
+        return len != -1 ? detectCharset(buffer, len, def) : def;
     }
 
     public static Charset detectCharset(BufferedInputStream reader, Charset def) {
         byte[] buffer = new byte[2048];
-        int read;
+        int len;
 
+        // Read the first 2KiB of the file and test the file's encoding
         try {
             reader.mark(2048);
-            read = reader.read(buffer);
+            len = reader.read(buffer);
             reader.reset();
         } catch (Exception ex) {
             return null;
         }
 
-        return read != -1 ? detectCharset(buffer, read, def) : def;
+        return len != -1 ? detectCharset(buffer, len, def) : def;
     }
 
     public static Charset detectCharset(byte[] data, int len, Charset def) {
         // check the file header
         if (len > 4) {
-            if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE) {
+            if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE) { // FF FE 00 00 is UTF-32LE
                 return StandardCharsets.UTF_16LE;
-                // FF FE 00 00 is UTF-32LE
-            } else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF) {
+            } else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF) { // 00 00 FE FF is UTF-32BE
                 return StandardCharsets.UTF_16BE;
-                // 00 00 FE FF is UTF-32BE
             } else if (data[0] == (byte) 0xEF && data[1] == (byte) 0xBB && data[2] == (byte) 0xBF) { // UTF-8 with BOM, same sig as ISO-8859-1
                 return StandardCharsets.UTF_8;
             }
         }
 
-        // iterate through sets to test, and return the first that is ok
+        // Look for last Whitespace Character and ignore potentially broken words/multi-byte characters
+        int newLen = len;
+        for (; newLen > 0; --newLen) {
+            if (Character.isWhitespace(data[newLen - 1])) break;
+        }
+
+        // Buffer got too small? => checking whole buffer
+        if (len > 512 && newLen < 512) {
+            newLen = len;
+        }
+
+        ByteBuffer bBuff = ByteBuffer.wrap(data, 0, newLen).asReadOnlyBuffer();
+
+        // Check through a list of charsets and return the first one that could decode the buffer
         for (Charset charset : supportedCharsets) {
-            if (charset != null && isCharset(data, len, charset)) {
+            if (charset != null && isCharset(bBuff, charset)) {
                 return charset;
             }
+
+            bBuff.rewind();
         }
 
         return def;
     }
 
-    public static boolean isCharset(byte[] data, int len, Charset charset) {
-        try {
+    public static boolean isCharset(ByteBuffer data, Charset charset) {
         CharsetDecoder decoder = charset.newDecoder();
-            decoder.reset();
-            decoder.decode(ByteBuffer.wrap(data));
-            return true;
-        } catch (CharacterCodingException e) {
-        }
-        return false;
+        decoder.onMalformedInput(CodingErrorAction.REPORT);
+        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+
+        return decoder.decode(data, CharBuffer.allocate(data.capacity()), true).isUnderflow();
     }
 }
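For context, a rough usage sketch of the changed API (not part of the commit; the config.yml path, the wrapper class, and the UTF-8 fallback are made up, and TextUtils is assumed to be importable). The File overload probes the first 2 KiB and falls back to the given default charset, but returns null when the file cannot be read at all, so that case needs handling:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class DetectCharsetUsage {

    public static void main(String[] args) throws IOException {
        File file = new File("config.yml"); // hypothetical input file

        // Probe the first 2 KiB; UTF-8 is only the fallback for inconclusive content
        Charset charset = TextUtils.detectCharset(file, StandardCharsets.UTF_8);
        if (charset == null) {
            charset = StandardCharsets.UTF_8; // detectCharset returns null when the file can't be read
        }

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), charset))) {
            reader.lines().forEach(System.out::println);
        }
    }
}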