// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it as often as you want.
/*******************************************************************************
* Copyright (c) 2015-2019 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.deeplearning4j.text.tokenization.tokenizerfactory;
import lombok.Getter;
import lombok.NonNull;
import lombok.Setter;
import org.deeplearning4j.text.tokenization.tokenizer.BertWordPieceStreamTokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.BertWordPieceTokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.BertWordPiecePreProcessor;
import java.io.*;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;
/**
 * Factory that creates BERT WordPiece tokenizers sharing a single vocabulary and
 * a common set of pre-processors.
 *
 * @author Paul Dubs
 */
public class BertWordPieceTokenizerFactory implements TokenizerFactory {
    /** Vocabulary: token mapped to its index. Handed to every tokenizer this factory creates. */
    private final NavigableMap<String, Integer> vocab;
    /** Pre-processor passed to created tokenizers for use on the raw input, before splitting. */
    @Getter @Setter
    private TokenPreProcess preTokenizePreProcessor;
    /** Pre-processor passed to created tokenizers; not set by any constructor, so null until the setter is called. */
    @Getter @Setter
    private TokenPreProcess tokenPreProcessor;
    /**
     * Character set handed to stream tokenizers. Only assigned by the File/InputStream
     * constructors; it stays null when the factory is built from an in-memory vocabulary.
     */
    private Charset charset;

    /**
     * @param vocab         Vocabulary, as a navigable map
     * @param lowerCaseOnly If true: tokenization should convert all characters to lower case
     * @param stripAccents  If true: strip accents off characters. Usually same as lower case. Should be true when using "uncased" official BERT TensorFlow models
     */
    public BertWordPieceTokenizerFactory(NavigableMap<String, Integer> vocab, boolean lowerCaseOnly, boolean stripAccents) {
        this(vocab, new BertWordPiecePreProcessor(lowerCaseOnly, stripAccents, vocab));
    }

    /**
     * @param vocab                   Vocabulary, as a navigable map
     * @param preTokenizePreProcessor The preprocessor that should be used on the raw strings, before splitting
     */
    public BertWordPieceTokenizerFactory(NavigableMap<String, Integer> vocab, TokenPreProcess preTokenizePreProcessor) {
        this.vocab = vocab;
        this.preTokenizePreProcessor = preTokenizePreProcessor;
    }

    /**
     * Create a BertWordPieceTokenizerFactory, load the vocabulary from the specified file.
     * The expected format is a \n separated list of tokens for vocab entries
     *
     * @param pathToVocab   Path to vocabulary file
     * @param lowerCaseOnly If true: tokenization should convert all characters to lower case
     * @param stripAccents  If true: strip accents off characters. Usually same as lower case. Should be true when using "uncased" official BERT TensorFlow models
     * @param charset       Character set for the file
     * @throws IOException If an error occurs reading the vocab file
     */
    public BertWordPieceTokenizerFactory(File pathToVocab, boolean lowerCaseOnly, boolean stripAccents, @NonNull Charset charset) throws IOException {
        this(loadVocab(pathToVocab, charset), lowerCaseOnly, stripAccents);
        this.charset = charset;
    }

    /**
     * Create a BertWordPieceTokenizerFactory, load the vocabulary from the specified input stream.
     * The expected format for vocabulary is a \n separated list of tokens for vocab entries
     *
     * @param vocabInputStream Input stream to load vocabulary; it is closed once the vocabulary has been read
     * @param lowerCaseOnly    If true: tokenization should convert all characters to lower case
     * @param stripAccents     If true: strip accents off characters. Usually same as lower case. Should be true when using "uncased" official BERT TensorFlow models
     * @param charset          Character set for the vocab stream
     * @throws IOException If an error occurs reading the vocab stream
     */
    public BertWordPieceTokenizerFactory(InputStream vocabInputStream, boolean lowerCaseOnly, boolean stripAccents, @NonNull Charset charset) throws IOException {
        this(loadVocab(vocabInputStream, charset), lowerCaseOnly, stripAccents);
        this.charset = charset;
    }

    /**
     * Creates a tokenizer over the given string, using this factory's vocabulary and pre-processors.
     *
     * @param toTokenize Text to tokenize
     * @return A new {@link BertWordPieceTokenizer}
     */
    @Override
    public Tokenizer create(String toTokenize) {
        return new BertWordPieceTokenizer(toTokenize, vocab, preTokenizePreProcessor, tokenPreProcessor);
    }

    /**
     * Creates a tokenizer over the given stream, using this factory's vocabulary and pre-processors.
     *
     * @param toTokenize Stream containing the text to tokenize
     * @return A new {@link BertWordPieceStreamTokenizer}
     */
    @Override
    public Tokenizer create(InputStream toTokenize) {
        // NOTE(review): charset is null when this factory was built from an in-memory vocab map;
        // confirm BertWordPieceStreamTokenizer tolerates a null charset.
        return new BertWordPieceStreamTokenizer(toTokenize, charset, vocab, preTokenizePreProcessor, tokenPreProcessor);
    }

    /**
     * @return A read-only view of the vocabulary (token to index)
     */
    public Map<String, Integer> getVocab() {
        return Collections.unmodifiableMap(vocab);
    }

    /**
     * Loads a vocabulary from a stream. Each line is one token and is mapped to its
     * zero-based line number; if a token occurs more than once, the last occurrence wins.
     * The stream is closed when reading finishes.
     * <p>
     * The expected format is a \n separated list of tokens for vocab entries
     *
     * <pre>
     * foo
     * bar
     * baz
     * </pre>
     *
     * the tokens should not have any whitespace on either of their sides
     *
     * @param is      InputStream
     * @param charset Character set used to decode the stream
     * @return A vocab map with the proper sort order (reverse natural ordering of the tokens) for fast traversal
     * @throws IOException If an error occurs reading the stream
     */
    public static NavigableMap<String, Integer> loadVocab(InputStream is, Charset charset) throws IOException {
        final TreeMap<String, Integer> map = new TreeMap<>(Collections.reverseOrder());
        // Closing the reader also closes the wrapped input stream.
        try (final BufferedReader reader = new BufferedReader(new InputStreamReader(is, charset))) {
            String token;
            int i = 0;
            while ((token = reader.readLine()) != null) {
                map.put(token, i++);
            }
        }
        return map;
    }

    /**
     * Loads a vocabulary from a file, one token per line; see {@link #loadVocab(InputStream, Charset)}.
     *
     * @param vocabFile Vocabulary file
     * @param charset   Character set used to decode the file
     * @return A vocab map with the proper sort order for fast traversal
     * @throws IOException If the file cannot be opened or read
     */
    public static NavigableMap<String, Integer> loadVocab(File vocabFile, Charset charset) throws IOException {
        // Own the stream here so the file handle is released even if reader construction fails
        // (for example on a null charset) before the delegate gets a chance to close it.
        try (InputStream is = new FileInputStream(vocabFile)) {
            return loadVocab(is, charset);
        }
    }
}