io.github.eb4j.pdic.PdicInfo Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of pdic4j Show documentation
PDIC access library for java
There is a newer version: 0.3.3
/*
 * PDIC4j, a PDIC dictionary access library.
 * Copyright (C) 2021 Hiroshi Miura.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see .
 */

package io.github.eb4j.pdic;

import com.ibm.icu.charset.CharsetICU;
import org.apache.commons.lang3.ArrayUtils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.WeakHashMap;

/**
 * @author wak (Apache-2.0)
 * @author Hiroshi Miura
 */
@SuppressWarnings("membername")
class PdicInfo {
    protected File m_file;
    protected int m_bodyptr;
    protected List mSearchResult = new ArrayList<>();

    protected int m_start;
    protected int m_size;
    protected int m_blockbits;
    protected int m_nindex;
    protected int m_blocksize;
    protected boolean m_match;
    protected int m_searchmax; // 最大検索件数
    protected String m_dicname; // 辞書名

    protected int[] mIndexPtr;

    protected Charset mMainCharset;
    protected Charset mPhoneCharset;
    protected WeakHashMap mEncodeCache = new WeakHashMap<>();

    protected AnalyzeBlock mAnalyze;
    protected int mLastIndex = 0;
    protected PdicInfoCache mPdicInfoCache;

    private RandomAccessFile mSrcStream = null;

    @SuppressWarnings("avoidinlineconditionals")
    PdicInfo(final File file, final int start, final int size, final int nindex, final boolean blockbits,
             final int blocksize) {
        m_file = file;
        m_start = start;
        m_size = size;
        m_nindex = nindex;
        m_blockbits = (blockbits) ? 4 : 2;
        m_blocksize = blocksize;
        m_searchmax = 10;

        mPhoneCharset = CharsetICU.forNameICU("BOCU-1");
        mMainCharset = CharsetICU.forNameICU("BOCU-1");
        try {
            mSrcStream = new RandomAccessFile(m_file, "r");
            mAnalyze = new AnalyzeBlock();
            mPdicInfoCache = new PdicInfoCache(mSrcStream, m_start, m_size);
        } catch (FileNotFoundException ignored) {
        }
    }

    /**
     * byte配列の本文文字列をCharBufferに変換する.
     */
    static CharBuffer decodetoCharBuffer(final Charset cs, final byte[] array, final int pos, final int len) {
        return cs.decode(ByteBuffer.wrap(array, pos, len));
    }

    /**
     * 本文の文字列をByteBufferに変換する.
     */
    static ByteBuffer encodetoByteBuffer(final Charset cs, final String str) {
        return cs.encode(str);
    }

    /**
     * インデックス領域を検索.
     *
     * @return index of block
     */
    public int searchIndexBlock(final String word) {
        int min = 0;
        int max = m_nindex - 1;

        ByteBuffer buffer = mEncodeCache.get(word);
        if (buffer == null) {
            buffer = encodetoByteBuffer(mMainCharset, word);
            mEncodeCache.put(word, buffer);
        }
        int limit = buffer.limit();
        byte[] bytes = new byte[limit];
        System.arraycopy(buffer.array(), 0, bytes, 0, limit);
        int wordlen = bytes.length;

        int[] indexPtr = mIndexPtr;
        int blockbits = m_blockbits;
        PdicInfoCache pdicInfoCache = mPdicInfoCache;

        for (int i = 0; i < 32; i++) {
            if ((max - min) <= 1) {
                return min;
            }
            final int look = (int) (((long) min + max) / 2);
            final int len = indexPtr[look + 1] - indexPtr[look] - blockbits;
            final int comp = pdicInfoCache.compare(bytes, 0, wordlen, indexPtr[look], len);
            if (comp < 0) {
                max = look;
            } else if (comp > 0) {
                min = look;
            } else {
                return look;
            }
        }
        return min;
    }

    /**
     * Read index blocks.
     *
     * @return true when successfully read block, otherwise false.
     */
    public boolean readIndexBlock(final File indexcache) {
        if (mSrcStream != null) {
            m_bodyptr = m_start + m_size; // 本体位置=( index開始位置＋インデックスのサイズ)
            if (indexcache != null) {
                try (FileInputStream fis = new FileInputStream(indexcache)) {
                    byte[] buff = new byte[(m_nindex + 1) * 4];
                    int readlen = fis.read(buff);
                    if (readlen == buff.length) {
                        final int indexlen = m_nindex;
                        final int[] indexptr = new int[m_nindex + 1];
                        mIndexPtr = indexptr;
                        int ptr = 0;
                        for (int i = 0; i <= indexlen; i++) {
                            int b;
                            int dat;
                            b = buff[ptr++];
                            b &= 0xFF;
                            dat = b;
                            b = buff[ptr++];
                            b &= 0xFF;
                            dat |= (b << 8);
                            b = buff[ptr++];
                            b &= 0xFF;
                            dat |= (b << 16);
                            b = buff[ptr++];
                            b &= 0xFF;
                            dat |= (b << 24);
                            indexptr[i] = dat;
                        }
                        return true;
                    }
                } catch (IOException ignored) {
                }
            }

            // インデックスの先頭から見出し語のポインタを拾っていく
            final int nindex = m_nindex;
            final int[] indexPtr =  new int[nindex + 1]; // インデックスポインタの配列確保
            mIndexPtr = indexPtr;
            if (mPdicInfoCache.createIndex(m_blockbits, nindex, indexPtr)) {
                byte[] buff = new byte[indexPtr.length * 4];
                int p = 0;
                for (int c = 0; c <= nindex; c++) {
                    int data = indexPtr[c];
                    buff[p++] = (byte) (data & 0xFF);
                    data >>= 8;
                    buff[p++] = (byte) (data & 0xFF);
                    data >>= 8;
                    buff[p++] = (byte) (data & 0xFF);
                    data >>= 8;
                    buff[p++] = (byte) (data & 0xFF);
                }
                if (indexcache != null) {
                    try (FileOutputStream fos = new FileOutputStream(indexcache)) {
                        fos.write(buff, 0, buff.length);
                    } catch (IOException ignored) {
                    }
                }
                return true;
            }
        }
        mIndexPtr = null;
        return false;
    }

    /**
     * num個目の見出し語の実体が入っているブロック番号を返す.
     */
    public int getBlockNo(final int num) {
        int blkptr = mIndexPtr[num] - m_blockbits;
        mLastIndex = num;
        if (m_blockbits == 4) {
            return mPdicInfoCache.getInt(blkptr);
        } else {
            return mPdicInfoCache.getShort(blkptr);
        }
    }

    /**
     * 次の０までの長さを返す.
     *
     * @param array target byte array
     * @param pos start position
     * @return length of index.
     */
    static int getLengthToNextZero(final byte[] array, final int pos) {
        return ArrayUtils.indexOf(array, (byte) 0, pos) - pos;
        // int len = 0;
        // while (array[pos + len] != 0)
        //     len++;
        // return len;
    }

    boolean isMatch() {
        return m_match;
    }

    public String getFilename() {
        return m_file.getName();
    }

    public int getSearchMax() {
        return m_searchmax;
    }

    public void setSearchMax(final int m) {
        m_searchmax = m;
    }

    public void setDicName(final String b) {
        m_dicname = b;
    }

    public String getDicName() {
        return m_dicname;
    }

    // 単語を検索する
    public boolean searchWord(final String word) {
        // 検索結果クリア
        int cnt = 0;
        mSearchResult.clear();

        int ret = searchIndexBlock(word);

        boolean match = false;

        boolean searchret = false;
        while (true) {
            // 最終ブロックは超えない
            if (ret < m_nindex) {
                // 該当ブロック読み出し
                int block = getBlockNo(ret++);
                byte[] pblk = readBlockData(block);
                if (pblk != null) {
                    mAnalyze.setBuffer(pblk);
                    mAnalyze.setSearch(word);
                    searchret = mAnalyze.searchWord();
                    // 未発見でEOBの時のみもう一回、回る
                    if (!searchret && mAnalyze.mEob) {
                        continue;
                    }
                }
            }
            // 基本一回で抜ける
            break;
        }
        if (searchret) {
            // 前方一致するものだけ結果に入れる
            do {
                PdicElement res = mAnalyze.getRecord();
                if (res == null) {
                    break;
                }
                // 完全一致するかチェック
                if (res.mIndex.compareTo(word) == 0) {
                    match = true;
                }
                mSearchResult.add(res);

                cnt++;
                // 取得最大件数超えたら打ち切り
            } while (cnt < m_searchmax && hasMoreResult(true));
        }
        return match;
    }

    // 前方一致する単語の有無を返す
    boolean searchPrefix(final String word) {
        int ret = searchIndexBlock(word);

        for (int blk = 0; blk < 2; blk++) {
            // 最終ブロックは超えない
            if (ret + blk >= m_nindex) {
                break;
            }
            int block = getBlockNo(ret + blk);

            // 該当ブロック読み出し
            byte[] pblk = readBlockData(block);

            if (pblk != null) {
                mAnalyze.setBuffer(pblk);
                mAnalyze.setSearch(word);

                if (mAnalyze.searchWord()) {
                    return true;
                }
            }
        }
        return false;
    }

    List getResult() {
        return mSearchResult;
    }

    public List getMoreResult() {
        mSearchResult.clear();
        if (mAnalyze != null) {
            int cnt = 0;
            // 前方一致するものだけ結果に入れる
            while (cnt < m_searchmax && hasMoreResult(true)) {
                PdicElement res = mAnalyze.getRecord();
                if (res == null) {
                    break;
                }
                mSearchResult.add(res);
                cnt++;
            }
        }
        return mSearchResult;
    }

    public boolean hasMoreResult(final boolean incrementptr) {
        boolean result = mAnalyze.hasMoreResult(incrementptr);
        if (!result) {
            if (mAnalyze.isEob()) {    // EOBなら次のブロック読み出し
                int nextindex = mLastIndex + 1;
                // 最終ブロックは超えない
                if (nextindex < m_nindex) {
                    int block = getBlockNo(nextindex);

                    // 該当ブロック読み出し
                    byte[] pblk = readBlockData(block);

                    if (pblk != null) {
                        mAnalyze.setBuffer(pblk);
                        result = mAnalyze.hasMoreResult(incrementptr);
                    }
                }
            }
        }
        return result;
    }

    /**
     * データブロックを読み込み.
     *
     * @param blkno
     * @return 読み込まれたデータブロック
     */
    byte[] readBlockData(final int blkno) {
        byte[] buff = new byte[0x200];
        byte[] pbuf = buff;
        try {
            mSrcStream.seek(m_bodyptr + (long) blkno * m_blocksize);

            // 1ブロック分読込(１セクタ分先読み)
            if (mSrcStream.read(pbuf, 0, 0x200) < 0) {
                return null;
            }

            // 長さ取得
            int len = ((int) (pbuf[0])) & 0xFF;
            len |= (((int) (pbuf[1])) & 0xFF) << 8;

            // ブロック長判定
            if ((len & 0x8000) != 0) { // 32bit
                len &= 0x7FFF;
            }
            if (len > 0) {
                // ブロック不足分読込
                if (len * m_blocksize > 0x200) {
                    pbuf = new byte[m_blocksize * len];
                    System.arraycopy(buff, 0, pbuf, 0, 0x200);
                    if (mSrcStream.read(pbuf, 0x200, len * m_blocksize - 0x200) < 0) {
                        return null;
                    }
                }
            } else {
                pbuf = null;
            }
            return pbuf;
        } catch (IOException ignored) {
        }
        return null;
    }

    final class AnalyzeBlock {
        private byte[] mBuff;
        private boolean mLongfield;
        private byte[] mWord;
        private int mFoundPtr = -1;
        private int mNextPtr = -1;
        private final byte[] mCompbuff = new byte[1024];
        private int mCompLen = 0;
        private boolean mEob = false;

        AnalyzeBlock() {
        }

        public void setBuffer(final byte[] buff) {
            mBuff = buff;
            mLongfield = ((buff[1] & 0x80) != 0);
            ByteBuffer mBB = ByteBuffer.wrap(buff);
            mBB.order(ByteOrder.LITTLE_ENDIAN);
            mNextPtr = 2;
            mEob = false;
            mCompLen = 0;
        }

        public void setSearch(final String word) {
            ByteBuffer buffer = encodetoByteBuffer(mMainCharset, word);
            mEncodeCache.put(word, buffer);
            mWord = new byte[buffer.limit()];
            System.arraycopy(buffer.array(), 0, mWord, 0, buffer.limit());
        }

        public boolean isEob() {
            return mEob;
        }

        /**
         * ブロックデータの中から指定語を探す.
         */
        public boolean searchWord() {
            final byte[] bytes = mWord;
            final byte[] buff = mBuff;
            final boolean longfield = mLongfield;
            final byte[] compbuff = mCompbuff;
            final int wordlen = bytes.length;

            mFoundPtr = -1;

            // 訳語データ読込
            int ptr = mNextPtr;
            mNextPtr = -1;
            while (true) {
                int flen = 0;
                int retptr = ptr;
                int b;

                b = buff[ptr++];
                flen |= (b & 0xFF);

                b = buff[ptr++];
                b <<= 8;
                flen |= (b & 0xFF00);

                if (longfield) {
                    b = buff[ptr++];
                    b <<= 16;
                    flen |= (b & 0xFF0000);

                    b = buff[ptr++];
                    b <<= 24;
                    flen |= (b & 0x7F000000);
                }
                if (flen == 0) {
                    mEob = true;
                    break;
                }
                int qtr = ptr;
                ptr += flen + 1;
                ptr++;


                // 圧縮長
                int complen = (int) buff[qtr++];
                complen &= 0xFF;

                // 見出し語属性 skip
                qtr++;

                // 見出し語圧縮位置保存
                while ((compbuff[complen++] = buff[qtr++]) != 0) ;

                // 見出し語の方が短ければ不一致
                if (complen < wordlen) {
                    continue;
                }


                // 前方一致で比較
                boolean equal = true;
                for (int i = 0; i < wordlen; i++) {

                    if (compbuff[i] != bytes[i]) {
                        equal = false;
                        int cc = compbuff[i];
                        cc &= 0xFF;
                        int cw = bytes[i];
                        cw &= 0xFF;
                        // 超えてたら打ち切る
                        if (cc > cw) {
                            return false;
                        }
                        break;
                    }
                }
                if (equal) {
                    mFoundPtr = retptr;
                    mNextPtr = ptr;
                    mCompLen = complen - 1;
                    return true;
                }
            }
            return false;
        }

        /**
         * 最後の検索結果の単語を返す.
         *
         * @return search result
         */
        PdicElement getRecord() {
            if (mFoundPtr == -1) {
                return null;
            }
            final PdicElement res = new PdicElement();

            res.mIndex = decodetoCharBuffer(mMainCharset, mCompbuff, 0, mCompLen).toString();
            // ver6対応 見出し語が、<検索インデックス><表示用文字列>の順に
            // 設定されていてるので、分割する。
            // それ以前のverではdispに空文字列を保持させる。

            final String indexstr = res.mIndex;
            final int tab = indexstr.indexOf('\t');
            if (tab == -1) {
                res.mDisp = "";
            } else {
                res.mIndex = indexstr.substring(0, tab);
                res.mDisp = indexstr.substring(tab + 1);
            }

            final byte[] buff = mBuff;
            final boolean longfield = mLongfield;
            byte attr = 0;

            // 訳語データ読込
            int ptr = mFoundPtr;

            if (longfield) {
                ptr += 4;
            } else {
                ptr += 2;
            }
            int qtr = ptr;

            // 圧縮長
            // int complen = buff[qtr++];
            // complen &= 0xFF;
            qtr++;

            // 見出し語属性 skip
            attr = buff[qtr++];

            // 見出し語 skip
            while (buff[qtr++] != 0) ;

            // 訳語
            if ((attr & 0x10) != 0) { // 拡張属性ありの時
                int trnslen = getLengthToNextZero(buff, qtr);
                res.mTrans = decodetoCharBuffer(mMainCharset, buff, qtr, trnslen).toString().replace("\r", "");
                qtr += trnslen; // 次のNULLまでスキップ

                // 拡張属性取得
                byte eatr;
                while (((eatr = buff[qtr++]) & 0x80) == 0) {
                    if ((eatr & (0x10 | 0x40)) == 0) { // バイナリOFF＆圧縮OFFの場合
                        if ((eatr & 0x0F) == 0x01) { // 用例
                            int len = getLengthToNextZero(buff, qtr);
                            res.mSample = decodetoCharBuffer(mMainCharset, buff, qtr, len).toString().replace("\r", "");
                            qtr += len; // 次のNULLまでスキップ
                        } else if ((eatr & 0x0F) == 0x02) { // 発音
                            int len = getLengthToNextZero(buff, qtr);
                            res.mPhone = decodetoCharBuffer(mPhoneCharset, buff, qtr, len).toString();
                            qtr += len; // 次のNULLまでスキップ
                        }
                    } else {
                        // バイナリ属性か圧縮属性が来たら打ち切り
                        break;
                    }
                }
            } else {
                // 残り全部が訳文
                res.mTrans = decodetoCharBuffer(mMainCharset, buff, qtr, mNextPtr - qtr).toString().replace("\r", "");
            }
            return res;
        }

        // 次の項目が検索語に前方一致するかチェックする
        public boolean hasMoreResult(final boolean incrementptr) {
            byte[] word;
            final byte[] buff = mBuff;
            final boolean longfield = mLongfield;
            final byte[] compbuff = mCompbuff;

            // next search
            if (mFoundPtr == -1) {
                return false;
            }
            word = mWord;

            int wordlen = word.length;

            // 訳語データ読込
            int ptr = mNextPtr;

            int retptr = ptr;
            int flen;
            int b;

            b = buff[ptr++];
            flen = (b & 0xFF);

            b = buff[ptr++];
            b <<= 8;
            flen |= (b & 0xFF00);

            if (longfield) {
                b = buff[ptr++];
                b <<= 16;
                flen |= (b & 0xFF0000);

                b = buff[ptr++];
                b <<= 24;
                flen |= (b & 0x7F000000);
            }
            if (flen == 0) {
                mEob = true;
                return false;
            }
            int qtr = ptr;
            ptr += flen + 1;
            ptr++;

            // 圧縮長
            int complen = buff[qtr++];
            complen &= 0xFF;

            // 見出し語属性 skip
            qtr++;

            // 見出し語圧縮位置保存
            while ((compbuff[complen++] = buff[qtr++]) != 0) ;

            // 見出し語の方が短ければ不一致
            if (complen < wordlen) {
                return false;
            }

            // 前方一致で比較
            boolean equal = true;
            for (int i = 0; i < wordlen; i++) {
                if (compbuff[i] != word[i]) {
                    equal = false;
                    int cc = compbuff[i];
                    cc &= 0xFF;
                    int cw = word[i];
                    cw &= 0xFF;
                    // 超えてたら打ち切る
                    if (cc > cw) {
                        return false;
                    }
                    break;
                }
            }
            if (equal && incrementptr) {
                mFoundPtr = retptr;
                mNextPtr = ptr;
                mCompLen = complen - 1;
            }
            return equal;
        }
    }
}