mirror of
https://github.com/songoda/SongodaCore.git
synced 2024-11-27 12:35:12 +01:00
Fix charset not being detected correctly on UTF-8 files (SD-7843)
With this patch, the tested bytes always end on a whole word/line. This ensures that a multi-byte character (as in UTF-8) that is cut off at the end of the tested buffer is no longer detected as malformed input, which previously led to `ISO-8859-1` being detected instead, because it just doesn't care about anything and replaces everything it doesn't know with something it does... Why no error? Would like to know that too :/ This *should* not be able to break existing files or plugins.
This commit is contained in:
parent
21f8487988
commit
f545dcc2d9
@ -6,9 +6,10 @@ import java.io.BufferedInputStream;
|
|||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.charset.CharacterCodingException;
|
import java.nio.CharBuffer;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.nio.charset.CharsetDecoder;
|
import java.nio.charset.CharsetDecoder;
|
||||||
|
import java.nio.charset.CodingErrorAction;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
@ -16,6 +17,23 @@ import java.util.List;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
public class TextUtils {
|
public class TextUtils {
|
||||||
|
private static final List<Charset> supportedCharsets = new ArrayList<>();
|
||||||
|
|
||||||
|
static {
|
||||||
|
supportedCharsets.add(StandardCharsets.UTF_8); // UTF-8 BOM: EF BB BF
|
||||||
|
supportedCharsets.add(StandardCharsets.ISO_8859_1); // also starts with EF BB BF
|
||||||
|
// supportedCharsets.add(StandardCharsets.UTF_16LE); // FF FE
|
||||||
|
// supportedCharsets.add(StandardCharsets.UTF_16BE); // FE FF
|
||||||
|
// supportedCharsets.add(StandardCharsets.UTF_16);
|
||||||
|
|
||||||
|
try {
|
||||||
|
supportedCharsets.add(Charset.forName("windows-1253"));
|
||||||
|
supportedCharsets.add(Charset.forName("ISO-8859-7"));
|
||||||
|
} catch (Exception ignore) { // UnsupportedCharsetException technically can be thrown, but can also be ignored
|
||||||
|
}
|
||||||
|
|
||||||
|
supportedCharsets.add(StandardCharsets.US_ASCII);
|
||||||
|
}
|
||||||
|
|
||||||
public static String formatText(String text) {
|
public static String formatText(String text) {
|
||||||
return formatText(text, false);
|
return formatText(text, false);
|
||||||
@ -110,81 +128,78 @@ public class TextUtils {
|
|||||||
return s.replaceAll(ChatColor.COLOR_CHAR + ";" + ChatColor.COLOR_CHAR + "|" + ChatColor.COLOR_CHAR, "");
|
return s.replaceAll(ChatColor.COLOR_CHAR + ";" + ChatColor.COLOR_CHAR + "|" + ChatColor.COLOR_CHAR, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static final List<Charset> supportedCharsets = new ArrayList<>();
|
|
||||||
|
|
||||||
static {
|
|
||||||
supportedCharsets.add(StandardCharsets.UTF_8); // UTF-8 BOM: EF BB BF
|
|
||||||
supportedCharsets.add(StandardCharsets.ISO_8859_1); // also starts with EF BB BF
|
|
||||||
//supportedCharsets.add(StandardCharsets.UTF_16LE); // FF FE
|
|
||||||
//supportedCharsets.add(StandardCharsets.UTF_16BE); // FE FF
|
|
||||||
//supportedCharsets.add(StandardCharsets.UTF_16);
|
|
||||||
try {
|
|
||||||
supportedCharsets.add(Charset.forName("windows-1253"));
|
|
||||||
supportedCharsets.add(Charset.forName("ISO-8859-7"));
|
|
||||||
} catch (Exception ignore) {
|
|
||||||
} // UnsupportedCharsetException technically can be thrown, but can also be ignored
|
|
||||||
supportedCharsets.add(StandardCharsets.US_ASCII);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Charset detectCharset(File f, Charset def) {
|
public static Charset detectCharset(File f, Charset def) {
|
||||||
byte[] buffer = new byte[2048];
|
byte[] buffer = new byte[2048];
|
||||||
int read;
|
int len;
|
||||||
|
|
||||||
// read the first 2kb of the file and test the file's encoding
|
// Read the first 2KiB of the file and test the file's encoding
|
||||||
try (FileInputStream input = new FileInputStream(f)) {
|
try (FileInputStream input = new FileInputStream(f)) {
|
||||||
read = input.read(buffer);
|
len = input.read(buffer);
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return read != -1 ? detectCharset(buffer, read, def) : def;
|
return len != -1 ? detectCharset(buffer, len, def) : def;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Charset detectCharset(BufferedInputStream reader, Charset def) {
|
public static Charset detectCharset(BufferedInputStream reader, Charset def) {
|
||||||
byte[] buffer = new byte[2048];
|
byte[] buffer = new byte[2048];
|
||||||
int read;
|
int len;
|
||||||
|
|
||||||
|
// Read the first 2KiB of the file and test the file's encoding
|
||||||
try {
|
try {
|
||||||
reader.mark(2048);
|
reader.mark(2048);
|
||||||
read = reader.read(buffer);
|
len = reader.read(buffer);
|
||||||
reader.reset();
|
reader.reset();
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return read != -1 ? detectCharset(buffer, read, def) : def;
|
|
||||||
|
return len != -1 ? detectCharset(buffer, len, def) : def;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Charset detectCharset(byte[] data, int len, Charset def) {
|
public static Charset detectCharset(byte[] data, int len, Charset def) {
|
||||||
// check the file header
|
// check the file header
|
||||||
if (len > 4) {
|
if (len > 4) {
|
||||||
if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE) {
|
if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE) { // FF FE 00 00 is UTF-32LE
|
||||||
return StandardCharsets.UTF_16LE;
|
return StandardCharsets.UTF_16LE;
|
||||||
// FF FE 00 00 is UTF-32LE
|
} else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF) { // 00 00 FE FF is UTF-32BE
|
||||||
} else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF) {
|
|
||||||
return StandardCharsets.UTF_16BE;
|
return StandardCharsets.UTF_16BE;
|
||||||
// 00 00 FE FF is UTF-32BE
|
|
||||||
} else if (data[0] == (byte) 0xEF && data[1] == (byte) 0xBB && data[2] == (byte) 0xBF) { // UTF-8 with BOM, same sig as ISO-8859-1
|
} else if (data[0] == (byte) 0xEF && data[1] == (byte) 0xBB && data[2] == (byte) 0xBF) { // UTF-8 with BOM, same sig as ISO-8859-1
|
||||||
return StandardCharsets.UTF_8;
|
return StandardCharsets.UTF_8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// iterate through sets to test, and return the first that is ok
|
// Look for last Whitespace Character and ignore potentially broken words/multi-byte characters
|
||||||
|
int newLen = len;
|
||||||
|
for (; newLen > 0; --newLen) {
|
||||||
|
if (Character.isWhitespace(data[newLen - 1])) break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Buffer got too small? => checking whole buffer
|
||||||
|
if (len > 512 && newLen < 512) {
|
||||||
|
newLen = len;
|
||||||
|
}
|
||||||
|
|
||||||
|
ByteBuffer bBuff = ByteBuffer.wrap(data, 0, newLen).asReadOnlyBuffer();
|
||||||
|
|
||||||
|
// Check through a list of charsets and return the first one that could decode the buffer
|
||||||
for (Charset charset : supportedCharsets) {
|
for (Charset charset : supportedCharsets) {
|
||||||
if (charset != null && isCharset(data, len, charset)) {
|
if (charset != null && isCharset(bBuff, charset)) {
|
||||||
return charset;
|
return charset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bBuff.rewind();
|
||||||
}
|
}
|
||||||
|
|
||||||
return def;
|
return def;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean isCharset(byte[] data, int len, Charset charset) {
|
public static boolean isCharset(ByteBuffer data, Charset charset) {
|
||||||
try {
|
CharsetDecoder decoder = charset.newDecoder();
|
||||||
CharsetDecoder decoder = charset.newDecoder();
|
decoder.onMalformedInput(CodingErrorAction.REPORT);
|
||||||
decoder.reset();
|
decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||||
decoder.decode(ByteBuffer.wrap(data));
|
|
||||||
return true;
|
return decoder.decode(data, CharBuffer.allocate(data.capacity()), true).isUnderflow();
|
||||||
} catch (CharacterCodingException e) {
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user