
org.deeplearning4j.spark.text.functions.TextPipeline Maven / Gradle / Ivy
/*
*
* * Copyright 2015 Skymind,Inc.
* *
* * Licensed under the Apache License, Version 2.0 (the "License");
* * you may not use this file except in compliance with the License.
* * You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
*
*/
package org.deeplearning4j.spark.text.functions;
import org.apache.spark.Accumulator;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.deeplearning4j.berkeley.Counter;
import org.deeplearning4j.berkeley.Pair;
import org.deeplearning4j.models.embeddings.loader.VectorsConfiguration;
import org.deeplearning4j.models.word2vec.Huffman;
import org.deeplearning4j.models.word2vec.VocabWord;
import org.deeplearning4j.models.word2vec.wordstore.VocabCache;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache;
import org.deeplearning4j.spark.text.accumulators.WordFreqAccumulator;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicLong;
/**
* A Spark-based text pipeline
* with a minimum word frequency and stop words.
*
* @author Adam Gibson
*/
@SuppressWarnings("unchecked")
public class TextPipeline {
//params
private JavaRDD corpusRDD;
private int numWords;
private int nGrams;
private String tokenizer;
private String tokenizerPreprocessor;
private List stopWords = new ArrayList<>();
//Setup
private JavaSparkContext sc;
private Accumulator> wordFreqAcc;
private Broadcast> stopWordBroadCast;
// Return values
private JavaRDD, AtomicLong>> sentenceWordsCountRDD;
private VocabCache vocabCache = new AbstractCache<>();
private Broadcast> vocabCacheBroadcast;
private JavaRDD> vocabWordListRDD;
private JavaRDD sentenceCountRDD;
private long totalWordCount;
private boolean useUnk;
private VectorsConfiguration configuration;
/**
 * No-argument constructor. It performs no setup of its own, so every
 * field keeps the value given by its declaration.
 */
public TextPipeline() {
    // intentionally empty
}
// Constructor
public TextPipeline(JavaRDD corpusRDD, Broadcast
© 2015 - 2025 Weber Informatics LLC | Privacy Policy