/*
 * Copyright 2021-2024 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.metaeffekt.artifact.terms.model;

import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.artifact.analysis.utils.StringStats;
import com.metaeffekt.artifact.analysis.utils.StringUtils;
import lombok.Getter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * A single file can be represented as a sequence of {@link FileSegment} instances, each associated with further details.
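 * <p>
 * A minimal usage sketch (the {@code fileContent} and {@code metaData} arguments are assumed to be
 * provided by the caller; names are illustrative only):
 * <pre>{@code
 * void segmentExample(String fileContent, NormalizationMetaData metaData) {
 *     FileSegmentation segmentation = new FileSegmentation(fileContent, metaData);
 *     for (int i = 0; i < segmentation.getSegmentCount(); i++) {
 *         FileSegment segment = segmentation.getFileSegment(i);
 *         // inspect or further process the segment
 *     }
 * }
 * }</pre>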
 */
public class FileSegmentation {

    private static final Logger LOG = LoggerFactory.getLogger(FileSegmentation.class);

    @Getter
    final List<FileSegment> fileSegments = new ArrayList<>();

    private final MatchPreprocessor matchPreprocessor;

    /**
     * Decompose the given file content.
     *
     * @param fileContent           The content of the file.
     * @param normalizationMetaData The normalization metadata to use to segment the file.
     */
    public FileSegmentation(String fileContent, NormalizationMetaData normalizationMetaData) {
        this.matchPreprocessor = new MatchPreprocessor(normalizationMetaData);

        final String markedFileContent = preprocessForSegmentation(fileContent, this.matchPreprocessor);

        // use LICENSE-SEGMENT-MARKERS to segment the file context
        try {
            List<String> markedFileSegments = matchPreprocessor.applySegments(markedFileContent);

            // use initial segments to create fileSegments
            for (String markedSegmentContent : markedFileSegments) {
                fileSegments.add(new FileSegment(markedSegmentContent, matchPreprocessor));
            }
        } catch (Throwable e) {
            // this was required due to stackoverflow errors in min.js files.
            LOG.info("Cannot apply segments.");
        }

        // postprocess segments (apply and revert; split and merge)
        try {
            postProcessSegments(matchPreprocessor, normalizationMetaData);
        } catch (Throwable e) {
            // this was required due to stackoverflow errors in min.js files.
            LOG.info("Cannot post-process segments.", e);
        }
    }

    /**
     * Preprocesses the given license text by applying mappings for normalization and later segmentation.
     *
     * @param fileContent       The content of the file
     * @param matchPreprocessor The processor that removes markers and applies segmentation.
     *
     * @return The preprocessed file content.
     */
    public static String preprocessForSegmentation(String fileContent, MatchPreprocessor matchPreprocessor) {
        // NOTE: we do not apply masks on initial content level; not stable enough and formatting affected
        try {
            // apply mappings (all phases) to the original text; this may prevent/enable segmentation
            fileContent = matchPreprocessor.getNormalizationMetaData().applyMappings(fileContent);

            // NOTE: in case the file content is segmented, such that a mask spans several segments, the
            //   mask is not effective anymore.
        } catch (Throwable e) {
            // this was required due to stackoverflow errors in min.js files.
            LOG.info("Cannot apply mappings.");
        }
        return fileContent;
    }

    /**
     * Postprocesses the segmented license text by removing empty segments and reverting (merging)
     * segments based on the revert rules in the license YAML files.
     *
     * @param matchPreprocessor     The processor that removes markers and applies segmentation.
     * @param normalizationMetaData The normalization metadata to use to segment the file.
     */
    protected void postProcessSegments(MatchPreprocessor matchPreprocessor, NormalizationMetaData normalizationMetaData) {

        // remove empty segments
        fileSegments.removeIf(segment -> StringUtils
                .isEmpty(segment.getNormalizedContent().getNormalizedString()));

        // revert/merge segments
        for (int i = 0; i < fileSegments.size() - 1; i++) {
            if (fileSegments.get(i).getNormalizedContent().getNormalizedString().trim().isEmpty()) continue;

            final FileSegment currentFileSegment = fileSegments.get(i);
            final FileSegment nextFileSegment = fileSegments.get(i + 1);

            final StringStats current = currentFileSegment.getNormalizedContent();
            final StringStats next = nextFileSegment.getNormalizedContent();

            boolean merged = false;
            for (TermsMetaData lmd : normalizationMetaData.getLicenseMetaDataMap().values()) {
                if (lmd.getSegmentation() != null) {
                    for (MatchSequence ms : lmd.getSegmentation().getRevert()) {

                        boolean matchMatched = false;
                        boolean matchNextMatched = false;

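                        // a revert rule applies when its 'match' pattern is anchored at the end of
                        // the current segment and its 'matchNext' pattern is anchored at the start
                        // of the next segment; a null pattern matches unconditionally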
                        if (ms.getMatch() == null) {
                            matchMatched = true;
                        } else {
                            for (int match : matches(ms.getMatch(), current)) {
                                final StringStats matchStats = StringStats.normalize(ms.getMatch(), false);
                                if (match == current.getNormalizedString().length() - matchStats.getNormalizedString().length()) {
                                    matchMatched = true;
                                    break;
                                }
                            }
                        }
                        if (ms.getMatchNext() == null) {
                            matchNextMatched = true;
                        } else {
                            for (int match : matches(ms.getMatchNext(), next)) {
                                if (match == 0) {
                                    matchNextMatched = true;
                                    break;
                                }
                            }
                        }

                        if (matchMatched && matchNextMatched) {
                            final FileSegment mergedFileSegment = new FileSegment(
                                    currentFileSegment.getMarkedContent().trim() + " " +
                                            nextFileSegment.getMarkedContent().trim(), matchPreprocessor);

                            // replace existing segment
                            fileSegments.set(i, mergedFileSegment);

                            // remove obsolete segment
                            fileSegments.remove(i + 1);

                            // proceed on the same spot
                            i--;
                            merged = true;
                            break;
                        }
                    }
                }

                // break from outer for loop (iterating lmd) if merged (merge only once)
                if (merged) break;
            }
        }
    }

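    /**
     * Determines where the normalized {@code match} string occurs in the given normalized {@code content}.
     *
     * @param match   The string to search for (normalized before matching).
     * @param content The content to search in.
     *
     * @return The positions of all matches within the normalized content.
     */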
    protected int[] matches(String match, StringStats content) {
        return content.allMatches(StringStats.normalize(match, true));
    }

    /**
     * Getter for file segments.
     *
     * @param index The index of the file segment to return.
     *
     * @return The {@link FileSegment} at the given index.
     */
    public FileSegment getFileSegment(int index) {
        return fileSegments.get(index);
    }

    /**
     * Getter for the number of file segments.
     *
     * @return Returns the segment count.
     */
    public int getSegmentCount() {
        return fileSegments.size();
    }

    /**
     * Concatenates all file segments, surrounding each segment with a segment-start and a segment-end marker.
     *
     * @return The segment string.
     */
    public StringBuilder getSegmentsString() {
        StringBuilder sb = new StringBuilder();
        for (FileSegment fileSegment : fileSegments) {
            sb.append(String.format("SEGMENT-START-----------------------------%n"));
            sb.append(fileSegment.getContent());
            sb.append(String.format("%nSEGMENT-END-------------------------------%n"));
        }
        return sb;
    }

    /**
     * Concatenates all marked file segments, surrounding each segment with a segment-start and a segment-end marker.
     *
     * @return Returns the marked segment string.
     */
    public StringBuilder getMarkedSegmentsString() {
        StringBuilder sb = new StringBuilder();
        for (FileSegment fileSegment : fileSegments) {
            sb.append(String.format("SEGMENT-START-----------------------------%n"));
            sb.append(fileSegment.getMarkedContent());
            sb.append(String.format("%nSEGMENT-END-------------------------------%n"));
        }
        return sb;
    }

    /**
     * Writes the combined license file segments into a target folder for processing using scancode.
     *
     * @param targetFolder The folder for saving the file segments.
     *
     * @return Returns the list of segments that have been combined and written to the target folder.
     *
     * @throws IOException Thrown in case of issues during file access.
     */
    public List<FileSegment> combineSegmentsAndWriteFoldersForScancode(File targetFolder) throws IOException {
        int i = 0;
        List<FileSegment> fileSegments = combineSegments();
        for (FileSegment fileSegment : fileSegments) {

            String fileName = "segment-" + i + ".txt";
            File file = new File(targetFolder, fileName);
            FileUtils.write(file, fileSegment.getContent(), StandardCharsets.UTF_8);
            i++;
        }
        return fileSegments;
    }

    /**
     * Revises the segments by combining consecutive segments without matched terms and removing empty segments.
     *
     * @return Returns the list of combined segments.
     */
    private List<FileSegment> combineSegments() {
        List<FileSegment> combinedFileSegments = new ArrayList<>();
        List<FileSegment> temporaryFileSegments = new ArrayList<>();

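        // buffer consecutive segments without matched terms and merge them into a single filler
        // segment; segments with matched terms are kept as individual segments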
        for (FileSegment fileSegment : fileSegments) {
            if (fileSegment.getNormalizedSRP().getMatchedTerms().isEmpty()) {
                temporaryFileSegments.add(fileSegment);
            } else {
                combinedFileSegments.add(mergeFileSegments(temporaryFileSegments));
                temporaryFileSegments.clear();
                combinedFileSegments.add(fileSegment);
            }
        }
        combinedFileSegments.add(mergeFileSegments(temporaryFileSegments));
        combinedFileSegments.removeIf(combinedFileSegment -> combinedFileSegment.getContent().isEmpty());
        return combinedFileSegments;
    }

    /**
     * Merges a list of segments into one segment.
     *
     * @param temporaryFileSegments The unmerged file segments that are supposed to be merged.
     *
     * @return Returns a single merged {@link FileSegment} from the given temporaryFileSegments.
     */
    private FileSegment mergeFileSegments(List<FileSegment> temporaryFileSegments) {
        final StringBuilder content = new StringBuilder();
        for (FileSegment fileSegment : temporaryFileSegments) {
            content.append(" ").append(fileSegment.getContent());
        }
        return new FileSegment(content.toString(), matchPreprocessor);
    }

    /**
     * Uses the already split segments and all manipulations that have been applied to recompose the full text.
     *
     * Leaves SEGMENT-MARKER to differentiate the segments in the merged text.
     *
     * @return Recomposed text from the segments.
     */
    public StringStats mergeSegmentedText() {
        final StringBuilder orig = new StringBuilder();
        final StringBuilder normalized = new StringBuilder();

        for (FileSegment fileSegment : fileSegments) {
            // use the marked content
            orig.append(fileSegment.getMarkedContent());

            // reinsert segment markers to indicate segments
            orig.append("SEGMENT-MARKER");

            // append the normalized content (surrounded by whitespaces)
            normalized.append(fileSegment.getNormalizedContent().getNormalizedString());
        }

        final StringStats result = StringStats.normalize(orig.toString(), false);

        // by joining segments we may have inserted additional whitespaces; compensate
        result.update(normalized.toString().replaceAll("\\s+", " "));

        return result;
    }

}