/*
* Copyright 2021-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metaeffekt.artifact.terms.model;
import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.artifact.analysis.utils.StringStats;
import com.metaeffekt.artifact.analysis.utils.StringUtils;
import lombok.Getter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
* A single file can be represented as a sequence of {@link FileSegment} instances, each associated with further details.
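*
* <p>A minimal usage sketch (assumes the file content has already been read and a populated
* {@link NormalizationMetaData} instance is available):
* <pre>{@code
* FileSegmentation segmentation = new FileSegmentation(fileContent, normalizationMetaData);
* for (int i = 0; i < segmentation.getSegmentCount(); i++) {
*     FileSegment segment = segmentation.getFileSegment(i);
*     // process segment.getContent() as needed
* }
* }</pre>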
*/
public class FileSegmentation {
private static final Logger LOG = LoggerFactory.getLogger(FileSegmentation.class);
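/** The ordered list of segments the file content has been decomposed into. */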
@Getter
final List<FileSegment> fileSegments = new ArrayList<>();
private final MatchPreprocessor matchPreprocessor;
/**
* Decomposes the given file content into segments.
*
* @param fileContent The content of the file.
* @param normalizationMetaData The normalization metadata used to segment the file.
*/
public FileSegmentation(String fileContent, NormalizationMetaData normalizationMetaData) {
this.matchPreprocessor = new MatchPreprocessor(normalizationMetaData);
final String markedFileContent = preprocessForSegmentation(fileContent, this.matchPreprocessor);
// use LICENSE-SEGMENT-MARKERS to segment the file content
try {
List<String> markedFileSegments = matchPreprocessor.applySegments(markedFileContent);
// use initial segments to create fileSegments
for (String markedSegmentContent : markedFileSegments) {
fileSegments.add(new FileSegment(markedSegmentContent, matchPreprocessor));
}
} catch (Throwable e) {
// this was required due to stackoverflow errors in min.js files.
LOG.info("Cannot apply segments.");
}
// postprocess segments (apply and revert; split and merge)
try {
postProcessSegments(matchPreprocessor, normalizationMetaData);
} catch (Throwable e) {
// this was required due to stackoverflow errors in min.js files.
LOG.info("Cannot post-process segments.", e);
}
}
/**
* Preprocesses the given license text by applying mappings for normalization and later segmentation.
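*
* <p>A minimal call sketch (assumes a populated {@link NormalizationMetaData} instance is at hand):
* <pre>{@code
* MatchPreprocessor preprocessor = new MatchPreprocessor(normalizationMetaData);
* String markedContent = FileSegmentation.preprocessForSegmentation(fileContent, preprocessor);
* }</pre>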
*
* @param fileContent The content of the file
* @param matchPreprocessor The processor that removes markers and applies segmentation.
*
* @return The preprocessed file content.
*/
public static String preprocessForSegmentation(String fileContent, MatchPreprocessor matchPreprocessor) {
// NOTE: we do not apply masks on initial content level; not stable enough and formatting affected
try {
// apply mappings (all phases) to the original text; this may prevent/enable segmentation
fileContent = matchPreprocessor.getNormalizationMetaData().applyMappings(fileContent);
// NOTE: in case the file content is segmented, such that a mask spans several segments, the
// mask is not effective anymore.
} catch (Throwable e) {
// this was required due to stackoverflow errors in min.js files.
LOG.info("Cannot apply mappings.");
}
return fileContent;
}
/**
* Postprocesses the segments by removing empty segments and reverting (re-merging) segments based on
* the reverts defined in the license YAML files.
*
* @param matchPreprocessor The processor that removes markers and applies segmentation.
* @param normalizationMetaData The normalization metadata used to segment the file.
*/
protected void postProcessSegments(MatchPreprocessor matchPreprocessor, NormalizationMetaData normalizationMetaData) {
// remove empty segments
fileSegments.removeIf(segment -> StringUtils
.isEmpty(segment.getNormalizedContent().getNormalizedString()));
// revert/merge segments
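// a revert rule re-merges two adjacent segments; it applies when its match is anchored at the end
// of the current segment and its matchNext is anchored at the start of the following segment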
for (int i = 0; i < fileSegments.size() - 1; i++) {
if (fileSegments.get(i).getNormalizedContent().getNormalizedString().trim().isEmpty()) continue;
final FileSegment currentFileSegment = fileSegments.get(i);
final FileSegment nextFileSegment = fileSegments.get(i + 1);
final StringStats current = currentFileSegment.getNormalizedContent();
final StringStats next = nextFileSegment.getNormalizedContent();
boolean merged = false;
for (TermsMetaData lmd : normalizationMetaData.getLicenseMetaDataMap().values()) {
if (lmd.getSegmentation() != null) {
for (MatchSequence ms : lmd.getSegmentation().getRevert()) {
boolean matchMatched = false;
boolean matchNextMatched = false;
if (ms.getMatch() == null) {
matchMatched = true;
} else {
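// the configured match must end exactly at the end of the current normalized segment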
for (int match : matches(ms.getMatch(), current)) {
final StringStats matchStats = StringStats.normalize(ms.getMatch(), false);
if (match == current.getNormalizedString().length() - matchStats.getNormalizedString().length()) {
matchMatched = true;
break;
}
}
}
if (ms.getMatchNext() == null) {
matchNextMatched = true;
} else {
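// the configured matchNext must occur at the very start of the next normalized segment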
for (int match : matches(ms.getMatchNext(), next)) {
if (match == 0) {
matchNextMatched = true;
break;
}
}
}
if (matchMatched && matchNextMatched) {
final FileSegment mergedFileSegment = new FileSegment(
currentFileSegment.getMarkedContent().trim() + " " +
nextFileSegment.getMarkedContent().trim(), matchPreprocessor);
// replace existing segment
fileSegments.set(i, mergedFileSegment);
// remove obsolete segment
fileSegments.remove(i + 1);
// proceed on the same spot
i--;
merged = true;
break;
}
}
}
// break from outer for loop (iterating lmd) if merged (merge only once)
if (merged) break;
}
}
}
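/**
* Determines the start positions of all occurrences of the normalized match string within the given content.
*
* @param match The string to match; it is normalized before matching.
* @param content The content to search within.
*
* @return Returns the start positions of all matches within the normalized content.
*/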
protected int[] matches(String match, StringStats content) {
return content.allMatches(StringStats.normalize(match, true));
}
/**
* Getter for the file segment at the given index.
*
* @param index the index for a specific file segment
*
* @return The {@link FileSegment} at the given index.
*/
public FileSegment getFileSegment(int index) {
return fileSegments.get(index);
}
/**
* Getter for the number of file segments.
*
* @return Returns the segment count.
*/
public int getSegmentCount() {
return fileSegments.size();
}
/**
* Adds a segment-start marker and a segment-end marker to each file segment.
*
* @return The segment string.
*/
public StringBuilder getSegmentsString() {
StringBuilder sb = new StringBuilder();
for (FileSegment fileSegment : fileSegments) {
sb.append(String.format("SEGMENT-START-----------------------------%n"));
sb.append(fileSegment.getContent());
sb.append(String.format("%nSEGMENT-END-------------------------------%n"));
}
return sb;
}
/**
* Adds a segment-start marker and a segment-end marker to each marked file segment.
*
* @return Returns the marked segment string.
*/
public StringBuilder getMarkedSegmentsString() {
StringBuilder sb = new StringBuilder();
for (FileSegment fileSegment : fileSegments) {
sb.append(String.format("SEGMENT-START-----------------------------%n"));
sb.append(fileSegment.getMarkedContent());
sb.append(String.format("%nSEGMENT-END-------------------------------%n"));
}
return sb;
}
/**
* Creates a folder containing license file segments for processing using scancode.
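*
* <p>A minimal call sketch ({@code segmentation} is an instance of this class; the target folder is
* hypothetical):
* <pre>{@code
* File scancodeInputFolder = new File("target/scancode-segments");
* List<FileSegment> writtenSegments = segmentation.combineSegmentsAndWriteFoldersForScancode(scancodeInputFolder);
* }</pre>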
*
* @param targetFolder The folder for saving the file segments
*
* @throws IOException Thrown in case of issues during file access.
*
* @return Returns the list of segments that have been combined and written to the target folder.
*/
public List<FileSegment> combineSegmentsAndWriteFoldersForScancode(File targetFolder) throws IOException {
int i = 0;
List<FileSegment> fileSegments = combineSegments();
for (FileSegment fileSegment : fileSegments) {
String fileName = "segment-" + i + ".txt";
File file = new File(targetFolder, fileName);
FileUtils.write(file, fileSegment.getContent(), StandardCharsets.UTF_8);
i++;
}
return fileSegments;
}
/**
* Revises the segments by combining consecutive segments without matched terms and removing empty segments.
*
* @return Returns the list of combined segments.
*/
private List<FileSegment> combineSegments() {
List<FileSegment> combinedFileSegments = new ArrayList<>();
List<FileSegment> temporaryFileSegments = new ArrayList<>();
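// segments without matched terms are collected and merged into a single segment; segments with
// matched terms are kept as individual segments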
for (FileSegment fileSegment : fileSegments) {
if (fileSegment.getNormalizedSRP().getMatchedTerms().isEmpty()) {
temporaryFileSegments.add(fileSegment);
} else {
combinedFileSegments.add(mergeFileSegments(temporaryFileSegments));
temporaryFileSegments.clear();
combinedFileSegments.add(fileSegment);
}
}
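// merge any trailing segments without matched terms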
combinedFileSegments.add(mergeFileSegments(temporaryFileSegments));
combinedFileSegments.removeIf(combinedFileSegment -> combinedFileSegment.getContent().isEmpty());
return combinedFileSegments;
}
/**
* Merges a list of segments into one segment.
*
* @param temporaryFileSegments The file segments to be merged into one.
*
* @return Returns a single merged {@link FileSegment} from the given temporaryFileSegments.
*/
private FileSegment mergeFileSegments(List<FileSegment> temporaryFileSegments) {
final StringBuilder content = new StringBuilder();
for (FileSegment fileSegment : temporaryFileSegments) {
content.append(" ").append(fileSegment.getContent());
}
return new FileSegment(content.toString(), matchPreprocessor);
}
/**
* Uses the already split segments and all manipulations that have been applied to recompose the full text.
*
* Leaves SEGMENT-MARKER to differentiate the segments in the merged text.
*
* @return Recomposed text from the segments.
*/
public StringStats mergeSegmentedText() {
final StringBuilder orig = new StringBuilder();
final StringBuilder normalized = new StringBuilder();
for (FileSegment fileSegment : fileSegments) {
// use the marked content
orig.append(fileSegment.getMarkedContent());
// reinsert segment markers to indicate segments
orig.append("SEGMENT-MARKER");
// append the normalized content (surrounded by whitespaces)
normalized.append(fileSegment.getNormalizedContent().getNormalizedString());
}
final StringStats result = StringStats.normalize(orig.toString(), false);
// by joining segments we may have inserted additional whitespaces; compensate
result.update(normalized.toString().replaceAll("\\s+", " "));
return result;
}
}