Please wait. This may take a few minutes ...
Downloading a project requires considerable resources. Please understand that we need to cover our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download or modify it as often as you want.
io.github.eb4j.mdict.MDictDictionary Maven / Gradle / Ivy
/*
* MD4J, a parser library for MDict format.
* Copyright (C) 2021,2022 Hiroshi Miura.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package io.github.eb4j.mdict;
import io.github.eb4j.mdict.cache.SimpleLRUCache;
import io.github.eb4j.mdict.io.MDFileInputStream;
import io.github.eb4j.mdict.io.MDInputStream;
import io.github.eb4j.mdict.io.MDictUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.zip.DataFormatException;
import org.bouncycastle.util.encoders.Hex;
/**
* @author Hiroshi Miura
*/
public final class MDictDictionary {
private final MDFileInputStream mdInputStream;
private final DictionaryData dictionaryData;
private final RecordIndex recordIndex;
private final SimpleLRUCache textCache;
private final String mdxVersion;
private final String title;
private final Charset encoding;
private final String creationDate;
private final String format;
private final String description;
private final String styleSheet;
private final Boolean headerEncrypted;
private final Boolean indexEncrypted;
private final String keyCaseSensitive;
private final boolean mdx;
/**
* Constructor.
*
* @param info MDictDictionaryInfo object to handle.
* @param index index object.
* @param recordIndex record index object.
* @param mdInputStream mdx or mdd file.
* @param mdx true when file is mdx. false when file is mdd.
*/
private MDictDictionary(
final MDictDictionaryInfo info,
final DictionaryData index,
final RecordIndex recordIndex,
final MDFileInputStream mdInputStream,
final boolean mdx) {
dictionaryData = index;
this.recordIndex = recordIndex;
this.mdInputStream = mdInputStream;
//
mdxVersion = info.getRequiredEngineVersion();
title = info.getTitle();
String encodingName = info.getEncoding();
if (encodingName.equalsIgnoreCase("UTF-16")) {
encodingName = "UTF-16LE";
}
encoding = Charset.forName(encodingName);
creationDate = info.getCreationDate();
format = info.getFormat();
description = info.getDescription();
styleSheet = info.getStyleSheet();
headerEncrypted = MDictUtils.isHeaderEncrypted(info);
indexEncrypted = MDictUtils.isIndexEncrypted(info);
keyCaseSensitive = info.getKeyCaseSensitive();
this.mdx = mdx;
//
textCache = new SimpleLRUCache<>(64, 1_000);
}
/**
* Getter of MDX file version.
* @return version string.
*/
public String getMdxVersion() {
return mdxVersion;
}
public Charset getEncoding() {
return encoding;
}
public String getTitle() {
return title;
}
public String getCreationDate() {
return creationDate;
}
public String getFormat() {
return format;
}
public String getDescription() {
return description;
}
public boolean isHeaderEncrypted() {
return headerEncrypted;
}
public boolean isIndexEncrypted() {
return indexEncrypted;
}
public boolean isMdx() {
return mdx;
}
public boolean isKeyCaseSensitive() {
return "Yes".equals(keyCaseSensitive) || "true".equals(keyCaseSensitive);
}
public String getStyleSheet() {
return styleSheet;
}
/**
* read articles from dictionary with predictive(prefix) search.
*
* It read articles with prefix search.
* If you looks for word prefix "happ" then you may find words like
* "happy", "happiness", and "happily".
*
* @param word query word
* @return List of entries of word to article map.
* @throws MDException when dictionary corrupted or unknown error.
*/
public List> readArticlesPredictive(final String word) throws MDException {
if (!mdx) {
throw new MDException("Can not retrieve text data from MDD file.");
}
List> result = new ArrayList<>();
for (Map.Entry entry : getEntriesPredictive(word)) {
addEntry(result, entry);
}
return result;
}
/**
* read article from dictionary with exact match search.
*
* Its results depends indexed words in dictionary data.
*
* @param word query word.
* @return list of entries of word to article map.
* @throws MDException if dictionary corrupted.
*/
public List> readArticles(final String word) throws MDException {
if (!mdx) {
throw new MDException("Can not retrieve text data from MDD file.");
}
List> result = new ArrayList<>();
for (Map.Entry entry : getEntries(word)) {
addEntry(result, entry);
}
return result;
}
public byte[] readData(final String path) throws MDException {
for (Map.Entry entry : getEntries(path)) {
if (entry.getKey().equals(path)) {
Object value = entry.getValue();
if (value instanceof Long) {
return getData((Long) value);
}
}
}
return null;
}
public List> getEntries(final String word) {
return dictionaryData.lookUp(word);
}
public List> getEntriesPredictive(final String word) {
return dictionaryData.lookUpPredictive(word);
}
public byte[] getData(final Long offset) throws MDException {
int index = recordIndex.searchOffsetIndex(offset);
int pos = (int) (offset - recordIndex.getRecordOffsetDecomp(index));
try {
mdInputStream.seek(recordIndex.getCompOffset(index));
} catch (IOException e) {
throw new MDException("IO error.", e);
}
long compSize = recordIndex.getRecordCompSize(index);
long decompSize = recordIndex.getRecordDecompSize(index);
int dataSize;
if (recordIndex.getRecordNumEntries() - 1 > index) {
dataSize = (int) (recordIndex.getRecordOffsetDecomp(index + 1) - offset);
} else {
dataSize = (int) (decompSize - pos);
}
try {
byte[] result = new byte[dataSize];
byte[] buf = MDictUtils.decompressBuf(mdInputStream, compSize, decompSize, false);
System.arraycopy(buf, pos, result, 0, dataSize);
return result;
} catch (DataFormatException | IOException e) {
throw new MDException("Decompressed data seems incorrect.");
}
}
public String getText(final Long offset) throws MDException {
if (!mdx) {
throw new MDException("Can not retrieve text data from MDD file.");
}
// calculate block index and seek it
int index = recordIndex.searchOffsetIndex(offset);
long skipSize = offset - recordIndex.getRecordOffsetDecomp(index);
try {
mdInputStream.seek(recordIndex.getCompOffset(index));
} catch (IOException e) {
throw new MDException("IO error.", e);
}
long compSize = recordIndex.getRecordCompSize(index);
long decompSize = recordIndex.getRecordDecompSize(index);
try (MDInputStream decompressedStream = MDictUtils.decompress(mdInputStream, compSize, decompSize, false)) {
long moved = decompressedStream.skip(skipSize);
if (moved != skipSize) {
throw new MDException("Decompressed data seems incorrect.");
}
try (BufferedReader bufferedReader =
new BufferedReader(new InputStreamReader(decompressedStream, encoding), (int) decompSize)) {
return readLine(bufferedReader);
}
} catch (DataFormatException | IOException e) {
throw new MDException("data decompression error.", e);
}
}
private String readLine(final BufferedReader buff) throws IOException {
int c = buff.read();
if (c == -1) {
return null;
}
StringBuilder builder = new StringBuilder();
while (c != -1 && c != 0) {
builder.append((char) c);
c = buff.read();
}
return builder.toString();
}
private void addEntry(final List> result, final Map.Entry entry) {
if (entry.getValue() instanceof Long) {
addEntry(result, entry.getKey(), (Long) entry.getValue());
} else {
Long[] values = (Long[]) entry.getValue();
for (Long offset : values) {
addEntry(result, entry.getKey(), offset);
}
}
}
private void addEntry(final List> result, final String key, final Long offset) {
String value = textCache.get(offset);
if (value == null) {
try {
value = getText(offset);
} catch (MDException ignored) {
}
if (value == null) {
return;
}
}
result.add(new AbstractMap.SimpleEntry<>(key, value));
}
private static String getBaseName(final String path) {
String f = path;
if (f.endsWith(".mdx")) {
f = f.substring(0, f.length() - ".mdx".length());
}
return f;
}
/**
* Dictionary loader.
*
* entry point of MDict4j dictionary loader.
* It constructs MDictDictionary object.
*
* @param mdxFile File path of MDX file.
* @return MDictDictionary object.
* @throws MDException when something goes wrong.
*/
public static MDictDictionary loadDictionary(final String mdxFile) throws MDException {
File file = new File(mdxFile);
if (!file.isFile()) {
throw new MDException("Target file is not MDict file.");
}
byte[] keyword = loadDictionaryKey(mdxFile);
MDFileInputStream mdxInputStream;
MDictDictionaryInfo info;
DictionaryData index;
RecordIndex record;
try {
mdxInputStream = new MDFileInputStream(mdxFile);
MDictParser parser = MDictParser.createMDXParser(mdxInputStream);
info = parser.parseHeader();
index = parser.parseIndex(keyword);
record = parser.parseRecordBlock();
} catch (IOException | DataFormatException e) {
throw new MDException("Dictionary data read error", e);
}
return new MDictDictionary(info, index, record, mdxInputStream, true);
}
/**
* Dictionary data loader.
*
* entry point of MDict4j Data loader.
* It constructs MDictDictionary object from MDD file.
*
* @param mdxFile MDX file.
* @return MDictDictionary object.
* @throws MDException when something goes wrong.
* @throws IOException when MDX file doesn't exist.
*/
public static MDictDictionary loadDictionaryData(final String mdxFile) throws MDException, IOException {
File file = new File(mdxFile);
if (!file.isFile()) {
throw new MDException("Target file is not MDict file.");
}
String dictName = getBaseName(mdxFile);
byte[] keyword = loadDictionaryKey(mdxFile);
File mddFile = new File(dictName + ".mdd");
MDFileInputStream mddInputStream;
MDictDictionaryInfo info;
DictionaryData index;
RecordIndex record;
try {
mddInputStream = new MDFileInputStream(mddFile.getAbsolutePath());
MDictParser parser = MDictParser.createMDDParser(mddInputStream);
info = parser.parseHeader();
// force encoding to UTF-16
info.setEncoding("UTF-16LE");
index = parser.parseIndex(keyword);
record = parser.parseRecordBlock();
} catch (DataFormatException e) {
throw new MDException("Dictionary data read error", e);
}
return new MDictDictionary(info, index, record, mddInputStream, false);
}
/**
* parse dictionary.key file and return 128-bit regcode.
* @param mdxFile dictionary file path.
* @return byte[] password data, or null when error occurred
*/
private static byte[] loadDictionaryKey(final String mdxFile) {
String dictName = getBaseName(mdxFile);
Path keyFile = Paths.get(dictName + ".key");
if (!keyFile.toFile().isFile() || !keyFile.toFile().canRead()) {
return null;
}
try {
Optional first =
Files.readAllLines(keyFile, StandardCharsets.UTF_8).stream().findFirst();
if (first.isPresent()) {
byte[] keydata = new byte[16];
byte[] temp = Hex.decode(first.get().substring(0, 32));
System.arraycopy(temp, 0, keydata, 0, 16);
return keydata;
}
} catch (IOException ignore) {
}
return null;
}
}