// org.modeshape.jcr.TextExtractors (Maven / Gradle / Ivy)
/*
* ModeShape (http://www.modeshape.org)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.modeshape.jcr;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import org.modeshape.common.annotation.Immutable;
import org.modeshape.common.logging.Logger;
import org.modeshape.common.util.CheckArg;
import org.modeshape.common.util.StringUtil;
import org.modeshape.jcr.RepositoryConfiguration.Component;
import org.modeshape.jcr.api.text.TextExtractor;
import org.modeshape.jcr.text.TextExtractorOutput;
import org.modeshape.jcr.value.BinaryKey;
import org.modeshape.jcr.value.BinaryValue;
import org.modeshape.jcr.value.binary.AbstractBinaryStore;
import org.modeshape.jcr.value.binary.InMemoryBinaryValue;
/**
 * Facility for managing {@link TextExtractor} instances and submitting text extraction work.
 * <p>
 * In-memory binary values are processed synchronously on the calling thread via
 * {@link #extract(InMemoryBinaryValue, TextExtractor.Context)}. Any other binary value is handed to an
 * {@link ExecutorService}; the caller receives a {@link CountDownLatch} that is counted down once the background
 * {@link Worker} has finished, whether or not the extraction succeeded.
 * </p>
 */
@Immutable
public final class TextExtractors {

    private static final Logger LOGGER = Logger.getLogger(TextExtractors.class);

    /** The extractors, consulted in iteration order; emptied by {@link #shutdown()}. */
    private final List<TextExtractor> extractors;

    /** The executor onto which {@link Worker} instances are submitted. */
    private final ExecutorService extractingQueue;

    /**
     * The latches handed out per binary key.
     * NOTE(review): nothing in this class ever removes an entry from this map - confirm whether that is intended.
     */
    private final ConcurrentHashMap<BinaryKey, CountDownLatch> workerLatches;

    /**
     * Creates a new instance which uses the given executor and extractors.
     *
     * @param extractingQueue the executor on which asynchronous extraction work is run
     * @param extractors the extractors to use; the list is kept by reference and must support {@code clear()}, because
     *        {@link #shutdown()} empties it
     */
    public TextExtractors( ExecutorService extractingQueue,
                           List<TextExtractor> extractors ) {
        this.extractingQueue = extractingQueue;
        this.workerLatches = new ConcurrentHashMap<BinaryKey, CountDownLatch>();
        this.extractors = extractors;
    }

    /**
     * Creates a new instance from a running repository and its text extraction configuration. The executor is obtained from
     * the repository's context using the configured thread pool name, and the extractors are instantiated from the
     * configured components.
     *
     * @param repository the running state of the repository
     * @param extracting the text extraction configuration
     */
    TextExtractors( JcrRepository.RunningState repository,
                    RepositoryConfiguration.TextExtraction extracting ) {
        this(repository.context().getCachedTreadPool(extracting.getThreadPoolName()),
             getConfiguredExtractors(repository, extracting));
    }

    /**
     * Removes all extractors (which disables extraction, see {@link #extractionEnabled()}) and shuts down the executor.
     */
    protected void shutdown() {
        extractors.clear();
        extractingQueue.shutdown();
    }

    /**
     * Indicates whether there is at least one extractor available.
     *
     * @return {@code true} if the list of extractors is not empty, {@code false} otherwise
     */
    public boolean extractionEnabled() {
        return !extractors.isEmpty();
    }

    /**
     * Synchronously extracts the text from an in-memory binary value, on the calling thread.
     *
     * @param inMemoryBinaryValue the binary value from which to extract the text
     * @param context the context passed to the extractor
     * @return the text of the extractor output, or {@code null} if an exception was raised during extraction (in which case
     *         the error is logged)
     */
    public String extract( InMemoryBinaryValue inMemoryBinaryValue,
                           TextExtractor.Context context ) {
        try {
            return extractText(inMemoryBinaryValue, context);
        } catch (Exception e) {
            LOGGER.error(e, JcrI18n.errorExtractingTextFromBinary, inMemoryBinaryValue.getHexHash(), e.getLocalizedMessage());
        }
        return null;
    }

    /**
     * Submits an asynchronous extraction of the text of the given binary value.
     *
     * @param store the binary store in which the extracted text is looked up and stored
     * @param binaryValue the binary value from which to extract the text; checked with {@link CheckArg#isNotNull} once
     *        extraction is known to be enabled
     * @param context the context passed to the extractor
     * @return the latch which is counted down when the submitted work finishes, or {@code null} if extraction is not enabled
     *         or the value is an {@link InMemoryBinaryValue}
     */
    public CountDownLatch extract( AbstractBinaryStore store,
                                   BinaryValue binaryValue,
                                   TextExtractor.Context context ) {
        if (!extractionEnabled()) {
            return null;
        }
        // 'null instanceof X' is always false, so validating before the instanceof test does not change the outcome
        CheckArg.isNotNull(binaryValue, "binaryValue");
        if (binaryValue instanceof InMemoryBinaryValue) {
            // We never extract the text for binary values this way ...
            return null;
        }
        CountDownLatch latch = getWorkerLatch(binaryValue.getKey(), true);
        extractingQueue.execute(new Worker(store, binaryValue, context, latch));
        return latch;
    }

    /**
     * Returns the latch registered for the given binary key.
     *
     * @param binaryKey the key of the binary value
     * @param createIfMissing if {@code true} and no latch is registered for the key, a new latch with a count of 1 is
     *        atomically registered and returned
     * @return the registered latch; may be {@code null} only when {@code createIfMissing} is {@code false} and no latch is
     *         registered for the key
     */
    public CountDownLatch getWorkerLatch( BinaryKey binaryKey,
                                          boolean createIfMissing ) {
        if (createIfMissing) {
            CountDownLatch latch = new CountDownLatch(1);
            // putIfAbsent guarantees that concurrent callers all end up with the same latch instance
            CountDownLatch existingLatch = workerLatches.putIfAbsent(binaryKey, latch);
            return existingLatch != null ? existingLatch : latch;
        }
        return workerLatches.get(binaryKey);
    }

    /**
     * Runs through the extractors and has the first one which accepts the binary's mime-type extract the text.
     *
     * @param binary the binary value from which to extract the text
     * @param context the context passed to the extractor
     * @return the text of the extractor output, as returned by {@link TextExtractorOutput#getText()}
     * @throws Exception if reading the mime-type or the extraction itself fails
     */
    private String extractText( BinaryValue binary,
                                TextExtractor.Context context ) throws Exception {
        String mimeType = binary.getMimeType();
        TextExtractorOutput output = new TextExtractorOutput();
        for (TextExtractor extractor : extractors) {
            if (extractor.supportsMimeType(mimeType)) {
                // the first one which accepts the mime-type wins
                extractor.extractFrom(binary, output, context);
                break;
            }
        }
        return output.getText();
    }

    /**
     * Instantiates the extractors described by the configuration. A component which cannot be instantiated is reported to
     * the repository as an error and skipped, so that it does not prevent the remaining extractors from being created.
     *
     * @param repository the running state of the repository, used for reporting problems and errors
     * @param extracting the text extraction configuration
     * @return the list of successfully created extractors; never {@code null}
     */
    private static List<TextExtractor> getConfiguredExtractors( JcrRepository.RunningState repository,
                                                                RepositoryConfiguration.TextExtraction extracting ) {
        List<Component> extractorComponents = extracting.getTextExtractors(repository.problems());
        List<TextExtractor> extractors = new ArrayList<TextExtractor>(extractorComponents.size());
        for (Component component : extractorComponents) {
            try {
                TextExtractor extractor = component.createInstance(TextExtractors.class.getClassLoader());
                extractor.setLogger(ExtensionLogger.getLogger(extractor.getClass()));
                extractors.add(extractor);
            } catch (Throwable t) {
                String desc = component.getName();
                String repoName = repository.name();
                repository.error(t, JcrI18n.unableToInitializeTextExtractor, desc, repoName, t.getMessage());
            }
        }
        return extractors;
    }

    /**
     * A unit of work which extracts text from a binary value, stores that text in a store and notifies a latch that the
     * extraction operation has finished.
     */
    protected final class Worker implements Runnable {
        private final BinaryValue binaryValue;
        private final TextExtractor.Context context;
        private final AbstractBinaryStore store;
        private final CountDownLatch latch;

        /**
         * @param store the store in which the extracted text is looked up and saved
         * @param binaryValue the binary value from which to extract the text
         * @param context the context passed to the extractor
         * @param latch the latch to count down once this worker has finished
         */
        protected Worker( AbstractBinaryStore store,
                          BinaryValue binaryValue,
                          TextExtractor.Context context,
                          CountDownLatch latch ) {
            this.store = store;
            this.binaryValue = binaryValue;
            this.context = context;
            this.latch = latch;
        }

        @SuppressWarnings( "synthetic-access" )
        @Override
        public void run() {
            try {
                // only extract text if there isn't a stored value for the binary key (note that any changes in the binary
                // will produce a different key)
                if (store.getExtractedText(binaryValue) != null) {
                    return;
                }
                String extractedText = extractText(binaryValue, context);
                if (extractedText != null && !StringUtil.isBlank(extractedText)) {
                    store.storeExtractedText(binaryValue, extractedText);
                }
            } catch (Exception e) {
                LOGGER.error(e, JcrI18n.errorExtractingTextFromBinary, binaryValue.getHexHash(), e.getLocalizedMessage());
            } finally {
                // decrement the latch regardless of success/failure to avoid blocking, as extraction is not retried
                latch.countDown();
            }
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy