ua.net.nlp.other.clean.EncodingModule.groovy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp_uk Show documentation
Show all versions of nlp_uk Show documentation
NLP tools for Ukrainian language
The newest version!
package ua.net.nlp.other.clean
import java.nio.charset.StandardCharsets
import groovy.transform.CompileStatic
import groovy.transform.PackageScope
@PackageScope
@CompileStatic
class EncodingModule {
private static final String UTF8 = StandardCharsets.UTF_8.name()
OutputTrait out
String getText(File file) {
String text = null
if( file.length() > 10 ) {
FileInputStream fis = new FileInputStream(file)
byte[] bytes = new byte[1024]
fis.read(bytes);
fis.close()
if( file.length() > 100 ) {
if( file.bytes[0..3] == [0x50, 0x4B, 0x03, 0x04] ) {
out.println "\tERROR: found zip file, perhaps it's a Word document?"
return null
}
if( new String(bytes).startsWith("{\\rtf") ) {
out.println "\tERROR: found \"{\\rtf\", perhaps it's an RTF document?"
return null
}
}
if( ! isUTF8(bytes) ) {
out.println "\tNOTE: file is not in UTF-8 encoding"
text = file.getText(UTF8)
text = fixEncoding(text, file)
if( text == null )
return null
}
}
if( text == null ) {
text = file.getText(UTF8)
}
text
}
String tryEncoding(File file, String encoding) {
def cp1251Text = file.getText(encoding)
if( cp1251Text =~ /(?iu)[сц]ьк|ння|від|[іи]й|ої|ти| [ійвузао] | н[еі] | що / ) {
return cp1251Text
}
return null
}
@CompileStatic
String fixEncoding(String text, File file) {
if( text.contains("\u008D\u00C3") ) { // completely broken encoding for «ій»
out.println "\tWARNING: nonfixable broken encoding found, garbage will be left in!"
return null
}
if( text.contains("éîãî") ) {
out.println "\tWARNING: broken encoding"
// some text (esp. converted from pdf) have broken encoding in some lines and good one in others
int convertedLines = 0
int goodLines = 0
text = text.split(/\n/).collect { String line->
if( line.trim() && ! (line =~ /(?iu)[а-яіїєґ]/) ) {
line = new String(line.getBytes("cp1252"), "cp1251")
convertedLines += 1
}
else {
goodLines += 1
}
line
}
.join('\n')
if( text.contains("éîãî") ) {
out.println "\tERROR: still broken: encoding mixed with good one"
return null
}
// text = text.replaceAll(/([бвгґдзклмнпрстфхцшщ])\?([єїюя])/, '$1\'$2')
out.println "\tEncoding fixed (good lines: $goodLines, convertedLines: $convertedLines, text: " + CleanTextCore.getSample(text)
}
else {
["cp1251", "utf-16"].each { encoding ->
String decodedText = tryEncoding(file, encoding)
if( decodedText ) {
out.println "\tencoding found: $encoding"
text = decodedText
if( text.size() < 10 ) {
out.println "\tFile size < 10 chars, probaby $encoding conversion didn't work, skipping"
return null
}
out.println "\tEncoding converted: " + CleanTextCore.getSample(text)
}
}
}
if( text.contains("\uFFFD") ) {
out.println "\tERROR: File contains Unicode 'REPLACEMENT CHARACTER' (U+FFFD)"
return null
}
return text
}
@CompileStatic
private static boolean isUTF8(byte[] pText) {
int expectedLength = 0;
for (int i = 0; i < pText.length && i < 300; i++) {
if ((pText[i] & 0b10000000) == 0b00000000) {
expectedLength = 1;
} else if ((pText[i] & 0b11100000) == 0b11000000) {
expectedLength = 2;
} else if ((pText[i] & 0b11110000) == 0b11100000) {
expectedLength = 3;
} else if ((pText[i] & 0b11111000) == 0b11110000) {
expectedLength = 4;
} else if ((pText[i] & 0b11111100) == 0b11111000) {
expectedLength = 5;
} else if ((pText[i] & 0b11111110) == 0b11111100) {
expectedLength = 6;
} else {
return false;
}
while (--expectedLength > 0) {
if (++i >= pText.length) {
return false;
}
if ((pText[i] & 0b11000000) != 0b10000000) {
return false;
}
}
}
return true;
}
}