All downloads are free. The search and download features use the official Maven repository.

com.rolfje.anonimatron.file.FileAnonymizerService Maven / Gradle / Ivy

package com.rolfje.anonimatron.file;

import com.rolfje.anonimatron.anonymizer.AnonymizerService;
import com.rolfje.anonimatron.configuration.Column;
import com.rolfje.anonimatron.configuration.Configuration;
import com.rolfje.anonimatron.configuration.DataFile;
import com.rolfje.anonimatron.progress.Progress;
import com.rolfje.anonimatron.progress.ProgressPrinter;
import com.rolfje.anonimatron.synonyms.Synonym;
import org.apache.log4j.Logger;

import java.io.File;
import java.io.FileFilter;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

/**
 * Anonymizes the data files listed in a {@link Configuration}.
 *
 * <p>For every configured {@link DataFile} the service expands input directories
 * into individual files, checks that no output file collides with an input or
 * another output, applies the configured {@link FileFilter}s, and then copies
 * records from a {@link RecordReader} to a {@link RecordWriter}. Values of
 * columns that appear in the data file's column configuration are replaced by
 * the synonym returned by the {@link AnonymizerService}; all other values are
 * copied unchanged.
 */
public class FileAnonymizerService {
    private static final Logger LOG = Logger.getLogger(FileAnonymizerService.class);

    private final Configuration config;
    private final AnonymizerService anonymizerService;
    private Progress progress;


    /**
     * @param config            configuration providing the data files and file filter class names
     * @param anonymizerService service used to obtain the synonym for each configured column value
     */
    public FileAnonymizerService(Configuration config, AnonymizerService anonymizerService) {
        this.config = config;
        this.anonymizerService = anonymizerService;
    }


    /**
     * Prints a short start banner with the number of configured data file entries.
     * A configuration without any file entries is reported as 0 files.
     */
    public void printConfigurationInfo() {
        List<DataFile> configuredFiles = config.getFiles();
        int fileCount = (configuredFiles == null) ? 0 : configuredFiles.size();

        System.out.println("\nAnonymization process started\n");
        System.out.println("To do         : " + fileCount + " files.\n");
    }


    /**
     * Anonymizes all configured files.
     *
     * <p>A file is skipped when at least one configured file filter rejects it or
     * when its output file already exists. Progress is tracked in bytes of input;
     * each file, processed or skipped, is counted exactly once.
     *
     * @throws Exception if reading, anonymizing, writing or closing a file fails
     */
    public void anonymize() throws Exception {
        List<DataFile> files = expandDirectories(config.getFiles());
        System.out.println("Files to process: " + files.size());

        List<FileFilter> fileFilters = getFileFilters();

        long totalBytes = 0;
        for (DataFile file : files) {
            totalBytes += new File(file.getInFile()).length();
        }

        progress = new Progress();
        progress.setTotalitemstodo(totalBytes);

        ProgressPrinter printer = new ProgressPrinter(progress);
        printer.setPrintIntervalMillis(1000);

// 		Enable this when printing is better tested.
//		printer.start();

        for (DataFile file : files) {
            File infile = new File(file.getInFile());

            boolean skip = !isAcceptedByAll(fileFilters, infile)
                    || new File(file.getOutFile()).exists();
            if (skip) {
                System.out.println("Skipping " + file.getInFile());
                // Count the skipped bytes once so the progress total still adds up.
                progress.incItemsCompleted(infile.length());
                continue;
            }

            System.out.println("Anonymizing from " + file.getInFile());
            System.out.println("              to " + file.getOutFile());

            process(file);
            progress.incItemsCompleted(infile.length());
        }
        printer.stop();

        System.out.println("\nAnonymization process completed.\n");
    }

    /**
     * Tells whether every filter accepts the given file; stops at the first rejection.
     */
    private boolean isAcceptedByAll(List<FileFilter> fileFilters, File candidate) {
        for (FileFilter fileFilter : fileFilters) {
            if (!fileFilter.accept(candidate)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Anonymizes a single data file, closing the reader and writer even when
     * creating the writer or anonymizing a record fails.
     */
    private void process(DataFile file) throws Exception {
        Map<String, Column> columns = toMap(file.getColumns());

        RecordReader reader = createReader(file);
        try {
            RecordWriter writer = createWriter(file);
            try {
                anonymize(reader, writer, columns);
            } finally {
                writer.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Instantiates one file filter per class name listed in the configuration.
     *
     * @return the filters, or an empty list when none are configured
     */
    private List<FileFilter> getFileFilters() {
        List<FileFilter> fileFilters = new ArrayList<>();
        List<String> fileFilterStrings = config.getFileFilters();
        if (fileFilterStrings != null) {
            for (String fileFilterString : fileFilterStrings) {
                fileFilters.add(createFileFilter(fileFilterString));
            }
        }
        return fileFilters;
    }

    /**
     * Turns the configured entries into one {@link DataFile} per input file.
     *
     * <p>An entry whose input is a directory yields one data file for each regular
     * file in it. When the configured output is an existing directory, the output
     * file gets the input file's name inside that directory.
     *
     * @param files configured entries, may be {@code null} or empty
     * @return the expanded data files with absolute input and output paths
     * @throws RuntimeException if an output file already exists, an input does not
     *                          exist, or input and output files collide
     */
    private List<DataFile> expandDirectories(List<DataFile> files) {
        List<DataFile> allFiles = new ArrayList<>();

        if (files == null || files.isEmpty()) {
            return allFiles;
        }

        for (DataFile dataFile : files) {

            // Get all input files
            List<File> inFiles = getInputFiles(dataFile);

            // Check that we don't overwrite output files
            for (File inFile : inFiles) {

                File outFile = new File(dataFile.getOutFile());
                if (outFile.isDirectory()) {
                    outFile = new File(outFile, inFile.getName());
                }

                if (outFile.exists()) {
                    throw new RuntimeException("Output file exists: " + outFile.getAbsolutePath());
                }

                DataFile newDataFile = new DataFile();
                newDataFile.setColumns(dataFile.getColumns());
                newDataFile.setReader(dataFile.getReader());
                newDataFile.setWriter(dataFile.getWriter());
                newDataFile.setInFile(inFile.getAbsolutePath());
                newDataFile.setOutFile(outFile.getAbsolutePath());
                newDataFile.setDiscriminators(dataFile.getDiscriminators());
                allFiles.add(newDataFile);
            }
        }

        preventDataFileCollisions(allFiles);
        return allFiles;
    }

    /**
     * Verifies that no file is both read and written and that no output file is
     * written more than once.
     *
     * @param allFiles the data files to check
     * @throws RuntimeException on the first collision found
     */
    void preventDataFileCollisions(List<DataFile> allFiles) {
        HashSet<String> inFiles = new HashSet<>();
        for (DataFile dataFile : allFiles) {
            inFiles.add(dataFile.getInFile());
        }

        HashSet<String> outFiles = new HashSet<>();
        for (DataFile dataFile : allFiles) {
            String outFile = dataFile.getOutFile();

            if (outFile.equals(dataFile.getInFile())) {
                throw new RuntimeException("File used as both input and output: " + dataFile.getInFile() + ".");
            }

            if (outFiles.contains(outFile)) {
                throw new RuntimeException("Configuration will write twice to the same file " + outFile + ".");
            }

            if (inFiles.contains(outFile)) {
                throw new RuntimeException("Configuration will overwrite input file " + outFile + ".");
            }

            outFiles.add(outFile);
        }
    }

    /**
     * Resolves the input of a data file to the regular files it denotes.
     *
     * @param dataFile entry whose input path is a file or a directory
     * @return the file itself, or the regular files directly inside the directory
     *         (subdirectories are not descended into)
     * @throws RuntimeException if the input does not exist or the directory
     *                          cannot be listed
     */
    List<File> getInputFiles(DataFile dataFile) {
        List<File> inFiles = new ArrayList<>();
        File inFile = new File(dataFile.getInFile());

        if (inFile.isDirectory()) {
            File[] inputFiles = inFile.listFiles();
            if (inputFiles == null) {
                // listFiles() returns null when the directory cannot be read.
                throw new RuntimeException("Could not list input directory: " + inFile.getAbsolutePath());
            }

            for (File inputFile : inputFiles) {
                if (inputFile.isFile()) {
                    inFiles.add(inputFile);
                }
            }

        } else if (inFile.isFile()) {
            inFiles.add(inFile);
        } else {
            throw new RuntimeException("Input file does not exist: " + inFile.getAbsolutePath());
        }
        return inFiles;
    }

    /**
     * Reads records until the reader reports no more, writing the anonymized
     * version of each. {@code null} records returned by the reader are ignored.
     * Neither the reader nor the writer is closed here.
     *
     * @param reader  source of records
     * @param writer  destination for anonymized records
     * @param columns column configuration keyed by column name
     * @throws Exception if reading or writing fails
     */
    void anonymize(RecordReader reader, RecordWriter writer, Map<String, Column> columns) throws Exception {
        while (reader.hasRecords()) {
            Record read = reader.read();

            if (read != null) {
                Record anonymized = anonymize(read, columns);
                writer.write(anonymized);
            }
        }
    }

    /**
     * Builds a new record in which the value of every configured column is
     * replaced by its synonym; other values are copied unchanged.
     *
     * @param record  the record to anonymize; it is not modified here
     * @param columns column configuration keyed by column name
     * @return a new record with the same names and the anonymized values
     */
    Record anonymize(Record record, Map<String, Column> columns) {
        String[] names = record.getNames();
        Object[] originalValues = record.getValues();

        // NOTE(review): assumes names and values have the same length - confirm against Record.
        Object[] values = new Object[originalValues.length];
        for (int i = 0; i < names.length; i++) {
            Column column = columns.get(names[i]);

            if (column != null || columns.containsKey(names[i])) {
                Synonym synonym = anonymizerService.anonymize(column, originalValues[i]);
                values[i] = synonym.getTo();
            } else {
                values[i] = originalValues[i];
            }
        }

        Record outputRecord = new Record(names, values);

        if (LOG.isTraceEnabled()) {
            LOG.trace(record);
            LOG.trace(outputRecord);
        }
        return outputRecord;
    }

    /**
     * Indexes the columns by name.
     *
     * @param columns configured columns, may be {@code null} or empty
     * @return a map from column name to column; empty when no columns are given
     */
    private Map<String, Column> toMap(List<Column> columns) {
        Map<String, Column> map = new HashMap<>();

        if (columns == null || columns.isEmpty()) {
            return map;
        }

        for (Column column : columns) {
            map.put(column.getName(), column);
        }
        return map;
    }

    /**
     * Reflectively creates the reader class named by the data file, passing the
     * input file path to its single-{@code String} constructor.
     *
     * @throws RuntimeException wrapping any failure to load or instantiate the class
     */
    private RecordReader createReader(DataFile file) {
        try {
            Class<?> clazz = Class.forName(file.getReader());
            Constructor<?> constructor = clazz.getConstructor(String.class);
            return (RecordReader) constructor.newInstance(file.getInFile());
        } catch (Exception e) {
            throw new RuntimeException("Problem creating reader " + file.getReader() + " for input file " + file.getInFile() + ".", e);
        }
    }

    /**
     * Reflectively creates the writer class named by the data file, passing the
     * output file path to its single-{@code String} constructor.
     *
     * @throws RuntimeException wrapping any failure to load or instantiate the class
     */
    private RecordWriter createWriter(DataFile file) {
        try {
            Class<?> clazz = Class.forName(file.getWriter());
            Constructor<?> constructor = clazz.getConstructor(String.class);
            return (RecordWriter) constructor.newInstance(file.getOutFile());
        } catch (Exception e) {
            throw new RuntimeException("Problem creating writer " + file.getWriter() + " for output file " + file.getOutFile() + ".", e);
        }
    }

    /**
     * Reflectively creates a file filter through its no-argument constructor.
     *
     * @param fileFilterClass fully qualified name of a {@link FileFilter} implementation
     * @throws RuntimeException wrapping any failure to load or instantiate the class
     */
    private FileFilter createFileFilter(String fileFilterClass) {
        try {
            Class<?> clazz = Class.forName(fileFilterClass);
            Constructor<?> constructor = clazz.getConstructor();
            return (FileFilter) constructor.newInstance();
        } catch (Exception e) {
            throw new RuntimeException("Problem creating file filter " + fileFilterClass + ".", e);
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy