All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.github.eb4j.pdic.DictionaryData Maven / Gradle / Ivy

The newest version!
/*
 * PDIC4j, a PDIC dictionary access library.
 * Copyright (C) 2021 Hiroshi Miura.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

package io.github.eb4j.pdic;

import com.ibm.icu.charset.CharsetICU;
import org.apache.commons.io.FileUtils;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

/**
 * Low-level reader for the index area and data blocks of one PDIC dictionary file.
 *
 * <p>Typical usage: construct, call {@link #readIndexBlock(File)} once to populate the
 * index pointer table, then call {@link #searchWord(String)} and read the hits with
 * {@link #getResult()} / {@link #getMoreResult()}.
 *
 * <p>The instance keeps mutable search state (current block, last index, result list)
 * and contains no synchronization.
 *
 * <p>The underlying file is opened in the constructor; call {@link #close()} when the
 * dictionary is no longer needed.
 *
 * @author wak (Apache-2.0)
 * @author Hiroshi Miura
 */
class DictionaryData implements AutoCloseable {

    /** Size in bytes of the first chunk read for every data block. */
    private static final int SECTOR_SIZE = 0x200;

    /** Number of bytes used to store one index pointer in the index cache file. */
    private static final int INDEX_ENTRY_BYTES = 4;

    private final RandomAccessFile sourceStream;
    private final File file;
    private final List<PdicElement> searchResults = new ArrayList<>();
    private final int start;
    private final int size;
    private final int blockBits;
    private final int nIndex;
    private final int blocksize;
    private final Charset mainCharset = CharsetICU.forNameICU("BOCU-1");
    private final AnalyzeBlock analyze;
    private final IndexCache indexCache;

    private boolean match;
    private int searchmax; // maximum number of results collected per search call

    private int bodyPtr;
    private int[] indexPtr;
    private int lastIndex = 0;

    /**
     * Opens the dictionary file read-only and prepares the index cache.
     *
     * @param file      dictionary file to read.
     * @param start     file offset where the index area starts.
     * @param size      size of the index area in bytes; the data body starts at {@code start + size}.
     * @param nindex    number of index entries.
     * @param blockbits {@code true} when block numbers in the index are 4 bytes wide,
     *                  {@code false} when they are 2 bytes wide.
     * @param blocksize unit size in bytes by which block numbers and block lengths are multiplied.
     * @param searchMax maximum number of results collected per search call.
     * @throws FileNotFoundException when the file cannot be opened for reading.
     */
    DictionaryData(@NotNull final File file, final int start, final int size, final int nindex, final boolean blockbits,
                   final int blocksize, final int searchMax) throws FileNotFoundException {
        this.file = file;
        this.start = start;
        this.size = size;
        this.nIndex = nindex;
        this.blockBits = blockbits ? 4 : 2;
        this.blocksize = blocksize;
        this.searchmax = searchMax;
        sourceStream = new RandomAccessFile(file, "r");
        indexCache = new IndexCache(sourceStream, this.start, this.size);
        analyze = new AnalyzeBlock();
    }

    /**
     * Binary-searches the index area for the entry at which a lookup of {@code word} should start.
     *
     * <p>{@link #readIndexBlock(File)} must have succeeded before this is called.
     *
     * @param word word to look up; it is encoded with the dictionary charset before comparison.
     * @return index of block
     */
    public int searchIndexBlock(final String word) {
        int min = 0;
        int max = nIndex - 1;

        ByteBuffer buffer = Utils.encodetoByteBuffer(mainCharset, word);
        int limit = buffer.limit();
        byte[] bytes = new byte[limit];
        System.arraycopy(buffer.array(), 0, bytes, 0, limit);
        // The interval shrinks on every pass, so 32 passes are enough for any int range;
        // the fixed bound is only a safety net.
        for (int i = 0; i < 32; i++) {
            if ((max - min) <= 1) {
                return min;
            }
            final int look = (int) (((long) min + max) / 2);
            // Length of the entry's key: distance to the next entry minus the block-number field.
            final int len = indexPtr[look + 1] - indexPtr[look] - blockBits;
            final int comp = indexCache.compare(bytes, 0, bytes.length, indexPtr[look], len);
            if (comp < 0) {
                max = look;
            } else if (comp > 0) {
                min = look;
            } else {
                return look;
            }
        }
        return min;
    }

    /**
     * Read index blocks.
     *
     * <p>The pointer table is loaded from {@code indexcache} when that file exists and holds
     * enough data; otherwise it is rebuilt from the dictionary and, when {@code indexcache}
     * is not {@code null}, written back to it.
     *
     * @param indexcache optional cache file for the index pointer table.
     * @return true when successfully read block, otherwise false.
     * @throws IOException when reading or writing the cache file fails.
     */
    public boolean readIndexBlock(@Nullable final File indexcache) throws IOException {
        bodyPtr = start + size; // body position = index start + index size
        return getIndexFromCache(indexcache) || createIndexAndWriteCache(indexcache);
    }

    /**
     * Loads the index pointer table from the cache file.
     *
     * @param indexcache cache file, may be {@code null}.
     * @return {@code true} when the table was loaded; {@code false} when there is no cache file
     *         or it holds fewer than {@code (nIndex + 1) * 4} bytes.
     * @throws IOException when reading the cache file fails.
     */
    private boolean getIndexFromCache(@Nullable final File indexcache) throws IOException {
        if (indexcache == null || !indexcache.isFile()) {
            return false;
        }
        final byte[] buff = new byte[(nIndex + 1) * INDEX_ENTRY_BYTES];
        try (FileInputStream fis = new FileInputStream(indexcache)) {
            // A single read() may legally return fewer bytes than requested,
            // so keep reading until the buffer is full or the stream ends.
            int filled = 0;
            while (filled < buff.length) {
                final int n = fis.read(buff, filled, buff.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
            if (filled != buff.length) {
                return false;
            }
        }
        final int[] pointers = new int[nIndex + 1];
        for (int i = 0; i <= nIndex; i++) {
            pointers[i] = decodeIntLE(buff, i * INDEX_ENTRY_BYTES);
        }
        indexPtr = pointers;
        return true;
    }

    /**
     * Builds the index pointer table from the dictionary and optionally stores it in the cache file.
     *
     * @param indexcache cache file to write, or {@code null} to skip writing.
     * @return {@code true} when the table was built; {@code false} otherwise, in which case
     *         the table is reset to {@code null}.
     * @throws IOException when writing the cache file fails.
     */
    private boolean createIndexAndWriteCache(@Nullable final File indexcache) throws IOException {
        // Collect the pointer of every headword, starting from the top of the index.
        final int nindex = nIndex;
        indexPtr = new int[nindex + 1]; // one extra slot beyond the nindex entries
        if (!indexCache.createIndex(blockBits, nindex, indexPtr)) {
            indexPtr = null;
            return false;
        }
        if (indexcache != null) {
            final byte[] buff = new byte[indexPtr.length * INDEX_ENTRY_BYTES];
            for (int c = 0; c <= nindex; c++) {
                encodeIntLE(indexPtr[c], buff, c * INDEX_ENTRY_BYTES);
            }
            try (FileOutputStream fos = FileUtils.openOutputStream(indexcache)) {
                fos.write(buff, 0, buff.length);
            }
        }
        return true;
    }

    /** Decodes a 32-bit little-endian integer from {@code src} starting at {@code offset}. */
    private static int decodeIntLE(final byte[] src, final int offset) {
        return (src[offset] & 0xFF)
                | ((src[offset + 1] & 0xFF) << 8)
                | ((src[offset + 2] & 0xFF) << 16)
                | ((src[offset + 3] & 0xFF) << 24);
    }

    /** Encodes {@code value} as a 32-bit little-endian integer into {@code dest} at {@code offset}. */
    private static void encodeIntLE(final int value, final byte[] dest, final int offset) {
        dest[offset] = (byte) (value & 0xFF);
        dest[offset + 1] = (byte) ((value >> 8) & 0xFF);
        dest[offset + 2] = (byte) ((value >> 16) & 0xFF);
        dest[offset + 3] = (byte) ((value >> 24) & 0xFF);
    }

    /**
     * Returns the number of the data block that holds the body of the {@code num}-th index entry.
     *
     * <p>Also records {@code num} as the last index visited, which
     * {@link #hasMoreResult(boolean)} uses to find the following block.
     *
     * @param num index entry number.
     * @return block number stored immediately before the entry's key in the index.
     */
    public int getBlockNo(final int num) {
        int blkptr = indexPtr[num] - blockBits;
        lastIndex = num;
        if (blockBits == 4) {
            return indexCache.getInt(blkptr);
        } else {
            return indexCache.getShort(blkptr);
        }
    }

    /**
     * @return {@code true} when the last {@link #searchWord(String)} collected a record whose
     *         index word equals the searched word.
     */
    boolean hasExactMatch() {
        return match;
    }

    /**
     * @return maximum number of results collected per search call.
     */
    public int getSearchMax() {
        return searchmax;
    }

    /**
     * @param m maximum number of results collected per search call.
     */
    public void setSearchMax(final int m) {
        searchmax = m;
    }

    /**
     * Searches the dictionary for a word and collects the first batch of results.
     *
     * <p>Previous results are discarded. At least one record is collected when the search
     * succeeds, then further records are added until {@link #getSearchMax()} is reached or
     * no more results are available.
     *
     * @param word word to search.
     * @return {@code true} when the block search reported a hit.
     * @throws IOException when reading the dictionary fails.
     */
    public boolean searchWord(final String word) throws IOException {
        // Clear the previous search results.
        int cnt = 0;
        searchResults.clear();

        int ret = searchIndexBlock(word);
        match = false;
        boolean searchret = false;
        // Never go past the last block. Normally a single pass is enough.
        while (ret < nIndex) {
            // Read the candidate block.
            final byte[] pblk = readBlockData(getBlockNo(ret++));
            if (pblk == null) {
                break;
            }
            analyze.setBuffer(pblk);
            analyze.setSearch(word);
            searchret = analyze.searchWord();
            // Try the next block only when nothing was found and the end of this block was reached.
            if (searchret || !analyze.isEob()) {
                break;
            }
        }
        if (searchret) {
            // Collect the matching records reported by the block analyzer.
            do {
                PdicElement res = analyze.getRecord();
                if (res == null) {
                    break;
                }
                // Check for an exact match.
                if (res.getIndexWord().compareTo(word) == 0) {
                    match = true;
                }
                searchResults.add(res);

                cnt++;
                // Stop once the maximum number of results has been collected.
            } while (cnt < searchmax && hasMoreResult(true));
        }
        return searchret;
    }

    /**
     * Returns the results collected by the last search call.
     *
     * @return the internal result list; it is cleared and refilled by
     *         {@link #searchWord(String)} and {@link #getMoreResult()}.
     */
    List<PdicElement> getResult() {
        return searchResults;
    }

    /**
     * Collects the next batch of results, continuing from the previous search position.
     *
     * @return the internal result list, refilled with up to {@link #getSearchMax()} records.
     * @throws IOException when reading the dictionary fails.
     */
    public List<PdicElement> getMoreResult() throws IOException {
        searchResults.clear();
        int cnt = 0;
        // Collect the following matching records.
        while (cnt < searchmax && hasMoreResult(true)) {
            PdicElement res = analyze.getRecord();
            if (res == null) {
                break;
            }
            searchResults.add(res);
            cnt++;
        }
        return searchResults;
    }

    /**
     * Checks whether another result is available, loading the next data block when the
     * current one is exhausted.
     *
     * @param incrementptr passed through to the block analyzer.
     * @return {@code true} when another result is available.
     * @throws IOException when reading the dictionary fails.
     */
    public boolean hasMoreResult(final boolean incrementptr) throws IOException {
        if (analyze.hasMoreResult(incrementptr)) {
            return true;
        }
        // Only move on when the end of the current block was reached.
        if (!analyze.isEob()) {
            return false;
        }
        final int nextindex = lastIndex + 1;
        // Never go past the last block.
        if (nextindex >= nIndex) {
            return false;
        }
        // Read the next block.
        final byte[] pblk = readBlockData(getBlockNo(nextindex));
        if (pblk == null) {
            return false;
        }
        analyze.setBuffer(pblk);
        return analyze.hasMoreResult(incrementptr);
    }

    /**
     * Reads one data block.
     *
     * <p>The first two bytes of a block hold its length, in units of {@code blocksize}, as a
     * little-endian value. Blocks longer than the initially read sector are read in full.
     *
     * @param blkno block number to seek when read.
     * @return data block read, or {@code null} when nothing could be read or the block length is zero.
     * @throws IOException when read error happened.
     */
    byte[] readBlockData(final int blkno) throws IOException {
        final byte[] head = new byte[SECTOR_SIZE];
        sourceStream.seek(bodyPtr + (long) blkno * blocksize);
        if (readUpTo(head, 0, SECTOR_SIZE) == 0) {
            return null;
        }

        // length of block.
        int len = (head[0] & 0xFF) | ((head[1] & 0xFF) << 8);

        // The top bit is a flag (labelled "32bit" in the original comment); strip it to get the length.
        len &= 0x7FFF;
        if (len <= 0) {
            return null;
        }
        final int total = len * blocksize;
        if (total <= SECTOR_SIZE) {
            return head;
        }
        // Read the part of the block that did not fit in the first sector.
        final byte[] block = new byte[total];
        System.arraycopy(head, 0, block, 0, SECTOR_SIZE);
        if (readUpTo(block, SECTOR_SIZE, total - SECTOR_SIZE) == 0) {
            return null;
        }
        return block;
    }

    /**
     * Reads from the dictionary file until {@code length} bytes were read or the end of the
     * file is reached. A single {@code read} call may return fewer bytes than requested.
     *
     * @param dest   destination buffer.
     * @param offset first index in {@code dest} to fill.
     * @param length number of bytes wanted.
     * @return number of bytes actually read; {@code 0} when the file was already at its end.
     * @throws IOException when read error happened.
     */
    private int readUpTo(final byte[] dest, final int offset, final int length) throws IOException {
        int total = 0;
        while (total < length) {
            final int n = sourceStream.read(dest, offset + total, length - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total;
    }

    /**
     * Closes the underlying dictionary file. The instance must not be used afterwards.
     *
     * @throws IOException when closing the file fails.
     */
    @Override
    public void close() throws IOException {
        sourceStream.close();
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy