All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.github.eb4j.mdict.MDictDictionary Maven / Gradle / Ivy

The newest version!
/*
 * MD4J, a parser library for MDict format.
 * Copyright (C) 2021,2022 Hiroshi Miura.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

package io.github.eb4j.mdict;

import io.github.eb4j.mdict.cache.SimpleLRUCache;
import io.github.eb4j.mdict.io.MDFileInputStream;
import io.github.eb4j.mdict.io.MDInputStream;
import io.github.eb4j.mdict.io.MDictUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.zip.DataFormatException;
import org.bouncycastle.util.encoders.Hex;

/**
 * @author Hiroshi Miura
 */
public final class MDictDictionary {
    private final MDFileInputStream mdInputStream;
    private final DictionaryData dictionaryData;
    private final RecordIndex recordIndex;
    private final SimpleLRUCache textCache;

    private final String mdxVersion;
    private final String title;
    private final Charset encoding;
    private final String creationDate;
    private final String format;
    private final String description;
    private final String styleSheet;
    private final Boolean headerEncrypted;
    private final Boolean indexEncrypted;
    private final String keyCaseSensitive;
    private final boolean mdx;

    /**
     * Constructor.
     *
     * @param info MDictDictionaryInfo object to handle.
     * @param index index object.
     * @param recordIndex record index object.
     * @param mdInputStream mdx or mdd file.
     * @param mdx true when file is mdx. false when file is mdd.
     */
    private MDictDictionary(
            final MDictDictionaryInfo info,
            final DictionaryData index,
            final RecordIndex recordIndex,
            final MDFileInputStream mdInputStream,
            final boolean mdx) {
        dictionaryData = index;
        this.recordIndex = recordIndex;
        this.mdInputStream = mdInputStream;
        //
        mdxVersion = info.getRequiredEngineVersion();
        title = info.getTitle();
        String encodingName = info.getEncoding();
        if (encodingName.equalsIgnoreCase("UTF-16")) {
            encodingName = "UTF-16LE";
        }
        encoding = Charset.forName(encodingName);
        creationDate = info.getCreationDate();
        format = info.getFormat();
        description = info.getDescription();
        styleSheet = info.getStyleSheet();
        headerEncrypted = MDictUtils.isHeaderEncrypted(info);
        indexEncrypted = MDictUtils.isIndexEncrypted(info);
        keyCaseSensitive = info.getKeyCaseSensitive();
        this.mdx = mdx;
        //
        textCache = new SimpleLRUCache<>(64, 1_000);
    }

    /**
     * Getter of MDX file version.
     * @return version string.
     */
    public String getMdxVersion() {
        return mdxVersion;
    }

    public Charset getEncoding() {
        return encoding;
    }

    public String getTitle() {
        return title;
    }

    public String getCreationDate() {
        return creationDate;
    }

    public String getFormat() {
        return format;
    }

    public String getDescription() {
        return description;
    }

    public boolean isHeaderEncrypted() {
        return headerEncrypted;
    }

    public boolean isIndexEncrypted() {
        return indexEncrypted;
    }

    public boolean isMdx() {
        return mdx;
    }

    public boolean isKeyCaseSensitive() {
        return "Yes".equals(keyCaseSensitive) || "true".equals(keyCaseSensitive);
    }

    public String getStyleSheet() {
        return styleSheet;
    }

    /**
     * read articles from dictionary with predictive(prefix) search.
     * 

* It read articles with prefix search. * If you looks for word prefix "happ" then you may find words like * "happy", "happiness", and "happily". *

* @param word query word * @return List of entries of word to article map. * @throws MDException when dictionary corrupted or unknown error. */ public List> readArticlesPredictive(final String word) throws MDException { if (!mdx) { throw new MDException("Can not retrieve text data from MDD file."); } List> result = new ArrayList<>(); for (Map.Entry entry : getEntriesPredictive(word)) { addEntry(result, entry); } return result; } /** * read article from dictionary with exact match search. *

* Its results depends indexed words in dictionary data. *

* @param word query word. * @return list of entries of word to article map. * @throws MDException if dictionary corrupted. */ public List> readArticles(final String word) throws MDException { if (!mdx) { throw new MDException("Can not retrieve text data from MDD file."); } List> result = new ArrayList<>(); for (Map.Entry entry : getEntries(word)) { addEntry(result, entry); } return result; } public byte[] readData(final String path) throws MDException { for (Map.Entry entry : getEntries(path)) { if (entry.getKey().equals(path)) { Object value = entry.getValue(); if (value instanceof Long) { return getData((Long) value); } } } return null; } public List> getEntries(final String word) { return dictionaryData.lookUp(word); } public List> getEntriesPredictive(final String word) { return dictionaryData.lookUpPredictive(word); } public byte[] getData(final Long offset) throws MDException { int index = recordIndex.searchOffsetIndex(offset); int pos = (int) (offset - recordIndex.getRecordOffsetDecomp(index)); try { mdInputStream.seek(recordIndex.getCompOffset(index)); } catch (IOException e) { throw new MDException("IO error.", e); } long compSize = recordIndex.getRecordCompSize(index); long decompSize = recordIndex.getRecordDecompSize(index); int dataSize; if (recordIndex.getRecordNumEntries() - 1 > index) { dataSize = (int) (recordIndex.getRecordOffsetDecomp(index + 1) - offset); } else { dataSize = (int) (decompSize - pos); } try { byte[] result = new byte[dataSize]; byte[] buf = MDictUtils.decompressBuf(mdInputStream, compSize, decompSize, false); System.arraycopy(buf, pos, result, 0, dataSize); return result; } catch (DataFormatException | IOException e) { throw new MDException("Decompressed data seems incorrect."); } } public String getText(final Long offset) throws MDException { if (!mdx) { throw new MDException("Can not retrieve text data from MDD file."); } // calculate block index and seek it int index = recordIndex.searchOffsetIndex(offset); long skipSize = 
offset - recordIndex.getRecordOffsetDecomp(index); try { mdInputStream.seek(recordIndex.getCompOffset(index)); } catch (IOException e) { throw new MDException("IO error.", e); } long compSize = recordIndex.getRecordCompSize(index); long decompSize = recordIndex.getRecordDecompSize(index); try (MDInputStream decompressedStream = MDictUtils.decompress(mdInputStream, compSize, decompSize, false)) { long moved = decompressedStream.skip(skipSize); if (moved != skipSize) { throw new MDException("Decompressed data seems incorrect."); } try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(decompressedStream, encoding), (int) decompSize)) { return readLine(bufferedReader); } } catch (DataFormatException | IOException e) { throw new MDException("data decompression error.", e); } } private String readLine(final BufferedReader buff) throws IOException { int c = buff.read(); if (c == -1) { return null; } StringBuilder builder = new StringBuilder(); while (c != -1 && c != 0) { builder.append((char) c); c = buff.read(); } return builder.toString(); } private void addEntry(final List> result, final Map.Entry entry) { if (entry.getValue() instanceof Long) { addEntry(result, entry.getKey(), (Long) entry.getValue()); } else { Long[] values = (Long[]) entry.getValue(); for (Long offset : values) { addEntry(result, entry.getKey(), offset); } } } private void addEntry(final List> result, final String key, final Long offset) { String value = textCache.get(offset); if (value == null) { try { value = getText(offset); } catch (MDException ignored) { } if (value == null) { return; } } result.add(new AbstractMap.SimpleEntry<>(key, value)); } private static String getBaseName(final String path) { String f = path; if (f.endsWith(".mdx")) { f = f.substring(0, f.length() - ".mdx".length()); } return f; } /** * Dictionary loader. *

* entry point of MDict4j dictionary loader. * It constructs MDictDictionary object. * * @param mdxFile File path of MDX file. * @return MDictDictionary object. * @throws MDException when something goes wrong. */ public static MDictDictionary loadDictionary(final String mdxFile) throws MDException { File file = new File(mdxFile); if (!file.isFile()) { throw new MDException("Target file is not MDict file."); } byte[] keyword = loadDictionaryKey(mdxFile); MDFileInputStream mdxInputStream; MDictDictionaryInfo info; DictionaryData index; RecordIndex record; try { mdxInputStream = new MDFileInputStream(mdxFile); MDictParser parser = MDictParser.createMDXParser(mdxInputStream); info = parser.parseHeader(); index = parser.parseIndex(keyword); record = parser.parseRecordBlock(); } catch (IOException | DataFormatException e) { throw new MDException("Dictionary data read error", e); } return new MDictDictionary(info, index, record, mdxInputStream, true); } /** * Dictionary data loader. *

* entry point of MDict4j Data loader. * It constructs MDictDictionary object from MDD file. *

* @param mdxFile MDX file. * @return MDictDictionary object. * @throws MDException when something goes wrong. * @throws IOException when MDX file doesn't exist. */ public static MDictDictionary loadDictionaryData(final String mdxFile) throws MDException, IOException { File file = new File(mdxFile); if (!file.isFile()) { throw new MDException("Target file is not MDict file."); } String dictName = getBaseName(mdxFile); byte[] keyword = loadDictionaryKey(mdxFile); File mddFile = new File(dictName + ".mdd"); MDFileInputStream mddInputStream; MDictDictionaryInfo info; DictionaryData index; RecordIndex record; try { mddInputStream = new MDFileInputStream(mddFile.getAbsolutePath()); MDictParser parser = MDictParser.createMDDParser(mddInputStream); info = parser.parseHeader(); // force encoding to UTF-16 info.setEncoding("UTF-16LE"); index = parser.parseIndex(keyword); record = parser.parseRecordBlock(); } catch (DataFormatException e) { throw new MDException("Dictionary data read error", e); } return new MDictDictionary(info, index, record, mddInputStream, false); } /** * parse dictionary.key file and return 128-bit regcode. * @param mdxFile dictionary file path. * @return byte[] password data, or null when error occurred */ private static byte[] loadDictionaryKey(final String mdxFile) { String dictName = getBaseName(mdxFile); Path keyFile = Paths.get(dictName + ".key"); if (!keyFile.toFile().isFile() || !keyFile.toFile().canRead()) { return null; } try { Optional first = Files.readAllLines(keyFile, StandardCharsets.UTF_8).stream().findFirst(); if (first.isPresent()) { byte[] keydata = new byte[16]; byte[] temp = Hex.decode(first.get().substring(0, 32)); System.arraycopy(temp, 0, keydata, 0, 16); return keydata; } } catch (IOException ignore) { } return null; } }