All downloads are free. The search and download functionality uses the official Maven repository.

org.molgenis.data.merge.RepositoryMerger Maven / Gradle / Ivy

There is a newer version: 1.21.5
Show newest version
package org.molgenis.data.merge;

import static org.molgenis.data.EntityMetaData.AttributeRole.ROLE_ID;

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

import org.molgenis.MolgenisFieldTypes;
import org.molgenis.data.AttributeMetaData;
import org.molgenis.data.DataService;
import org.molgenis.data.Entity;
import org.molgenis.data.EntityMetaData;
import org.molgenis.data.Query;
import org.molgenis.data.Repository;
import org.molgenis.data.elasticsearch.ElasticsearchRepositoryCollection;
import org.molgenis.data.support.AbstractEntity;
import org.molgenis.data.support.DefaultAttributeMetaData;
import org.molgenis.data.support.DefaultEntityMetaData;
import org.molgenis.data.support.MapEntity;
import org.molgenis.data.support.QueryImpl;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

/**
 * Merges the metadata and the data of several repositories into one repository, joining on a list of common
 * attributes.
 * <p>
 * Common attributes stay at the root level of the merged metadata. Every other attribute of a source repository is
 * renamed to {@code <repositoryName>_<attributeName>} and grouped in one compound attribute named after that
 * repository.
 * <p>
 * Created by charbonb on 01/09/14.
 */
@Component
public class RepositoryMerger
{
	/** Name of the generated id attribute of every merged entity. */
	private static final String ID = "ID";
	private final DataService dataService;

	@Autowired
	public RepositoryMerger(DataService dataService)
	{
		this.dataService = dataService;
	}

	/**
	 * Merges the data of all given repositories into the merged repository, using a batch size of 1000.
	 *
	 * @param repositoryList
	 *            list of repositories to be merged
	 * @param commonAttributes
	 *            list of common attributes, these columns are used to 'join'/'merge' on
	 * @param mergedRepository
	 *            the resulting repository; the data is written to the repository that the data service returns for
	 *            its name
	 * @return the {@code mergedRepository} argument
	 */
	public Repository merge(List<Repository> repositoryList, List<AttributeMetaData> commonAttributes,
			Repository mergedRepository)
	{
		return merge(repositoryList, commonAttributes, mergedRepository, 1000);
	}

	/**
	 * Merges the data of all given repositories into the merged repository.
	 *
	 * @param repositoryList
	 *            list of repositories to be merged
	 * @param commonAttributes
	 *            list of common attributes, these columns are used to 'join'/'merge' on
	 * @param mergedRepository
	 *            the resulting repository; the data is written to the repository that the data service returns for
	 *            its name
	 * @param batchSize
	 *            number of buffered records after which the result is added or updated in the repository
	 * @return the {@code mergedRepository} argument
	 */
	public Repository merge(List<Repository> repositoryList, List<AttributeMetaData> commonAttributes,
			Repository mergedRepository, int batchSize)
	{
		mergeData(repositoryList, dataService.getRepository(mergedRepository.getName()), commonAttributes, batchSize);

		return mergedRepository;
	}

	/**
	 * Merge the data of all repositories based on the common columns.
	 * <p>
	 * For every source entity the result repository is queried for an entity with the same common attribute values.
	 * If none is found a new merged entity is created, otherwise the found one is extended. New and existing entities
	 * are buffered separately and written with add/update once a buffer holds {@code batchSize} entities.
	 * <p>
	 * NOTE(review): the lookup runs against the result repository, so entities that are still buffered are presumably
	 * not visible to it; confirm how rows with identical common values within one source repository should behave.
	 */
	private void mergeData(List<Repository> originalRepositoriesList, Repository resultRepository,
			List<AttributeMetaData> commonAttributes, int batchSize)
	{
		for (Repository repository : originalRepositoriesList)
		{
			List<Entity> addedEntities = new ArrayList<Entity>();
			List<Entity> updatedEntities = new ArrayList<Entity>();
			for (Entity entity : repository)
			{
				boolean newEntity = false;

				Entity mergedEntity = getMergedEntity(resultRepository, commonAttributes, entity);
				// if no entity for all the common columns exists, create a new one, containing these fields
				if (mergedEntity == null)
				{
					newEntity = true;
					mergedEntity = createMergedEntity(commonAttributes, entity);
				}

				// add all data for non common fields; a hidden id attribute of the source is skipped
				EntityMetaData entityMeta = entity.getEntityMetaData();
				for (AttributeMetaData attr : entityMeta.getAtomicAttributes())
				{
					if ((!attr.equals(entityMeta.getIdAttribute()) || attr.isVisible())
							&& !containsIgnoreCase(attr.getName(), commonAttributes))
					{
						mergedEntity.set(getMergedAttributeName(repository, attr.getName()),
								entity.get(attr.getName()));
					}
				}

				if (newEntity)
				{
					addedEntities.add(mergedEntity);
				}
				else
				{
					updatedEntities.add(mergedEntity);
				}

				// write to repository every time a buffer holds batchSize entities
				if (addedEntities.size() == batchSize)
				{
					resultRepository.add(addedEntities.stream());
					addedEntities = new ArrayList<Entity>();
				}
				if (updatedEntities.size() == batchSize)
				{
					resultRepository.update(updatedEntities.stream());
					updatedEntities = new ArrayList<Entity>();
				}
			}
			// write remaining entities to repository
			resultRepository.add(addedEntities.stream());
			resultRepository.update(updatedEntities.stream());
		}
	}

	/**
	 * Create a new merged entity with a random UUID as id and the values of the common attributes copied from the
	 * given source entity.
	 */
	private AbstractEntity createMergedEntity(List<AttributeMetaData> commonAttributes, Entity entity)
	{
		AbstractEntity mergedEntity = new MapEntity(ID);
		mergedEntity.set(ID, UUID.randomUUID().toString());

		for (AttributeMetaData attributeMetaData : commonAttributes)
		{
			mergedEntity.set(attributeMetaData.getName(), entity.get(attributeMetaData.getName()));
		}
		return mergedEntity;
	}

	/**
	 * Check if an entity for the common attributes already exists and if so, return it.
	 * <p>
	 * The query is the conjunction of one equals-rule per common attribute, comparing against the value in the given
	 * entity.
	 *
	 * @return the result of {@code findOne} for that query; the caller treats {@code null} as "not found"
	 */
	private Entity getMergedEntity(Repository repository, List<AttributeMetaData> commonAttributes, Entity entity)
	{
		Query findMergedEntityQuery = new QueryImpl();
		for (AttributeMetaData attributeMetaData : commonAttributes)
		{
			// chain the rules with AND, but do not start the query with an operator
			if (!findMergedEntityQuery.getRules().isEmpty())
			{
				findMergedEntityQuery = findMergedEntityQuery.and();
			}
			findMergedEntityQuery = findMergedEntityQuery.eq(attributeMetaData.getName(),
					entity.get(attributeMetaData.getName()));
		}

		return repository.findOne(findMergedEntityQuery);
	}

	/**
	 * Create new EntityMetaData with the common attributes at root level, and all other columns in a compound
	 * attribute per original repository.
	 * <p>
	 * The merged metadata gets its own hidden {@code ID} attribute with the id role. The common id attribute is
	 * therefore never used as id: if it is visible a copy of it is added as a regular attribute, if it is hidden it
	 * is left out.
	 *
	 * @param repositoryList
	 *            list of repositories whose metadata is merged
	 * @param commonAttrs
	 *            list of common attributes that stay at root level
	 * @param commonIdAttr
	 *            the common attribute that is the id attribute of the source metadata
	 * @param outRepositoryName
	 *            name of the merged entity metadata
	 * @return the merged entity metadata
	 */
	public EntityMetaData mergeMetaData(List<Repository> repositoryList, List<AttributeMetaData> commonAttrs,
			AttributeMetaData commonIdAttr, String outRepositoryName)
	{
		DefaultEntityMetaData mergedMetaData = new DefaultEntityMetaData(outRepositoryName);
		mergedMetaData.setBackend(ElasticsearchRepositoryCollection.NAME);
		mergedMetaData.addAttribute(ID, ROLE_ID).setVisible(false);

		for (AttributeMetaData commonAttr : commonAttrs)
		{
			if (commonAttr.equals(commonIdAttr))
			{
				// Ignore hidden id attributes
				if (commonAttr.isVisible())
				{
					// We added a new ID, save old attribute but do not use it as id
					mergedMetaData.addAttributeMetaData(new DefaultAttributeMetaData(commonAttr));
				}
			}
			else
			{
				mergedMetaData.addAttributeMetaData(commonAttr);
			}
		}

		for (Repository repository : repositoryList)
		{
			mergeRepositoryMetaData(commonAttrs, mergedMetaData, repository);
		}
		return mergedMetaData;
	}

	/**
	 * Add a compound attribute for a repository containing all "non-common" attributes.
	 * <p>
	 * The compound attribute is named after the repository. A hidden id attribute of the repository is left out.
	 */
	private void mergeRepositoryMetaData(List<AttributeMetaData> commonAttributes,
			DefaultEntityMetaData mergedMetaData, Repository repository)
	{
		EntityMetaData originalRepositoryMetaData = repository.getEntityMetaData();
		DefaultAttributeMetaData repositoryCompoundAttribute = new DefaultAttributeMetaData(repository.getName(),
				MolgenisFieldTypes.FieldTypeEnum.COMPOUND);
		List<AttributeMetaData> attributeParts = new ArrayList<>();
		for (AttributeMetaData originalRepositoryAttr : originalRepositoryMetaData.getAttributes())
		{
			if (containsIgnoreCase(originalRepositoryAttr.getName(), commonAttributes))
			{
				// common attributes live at root level, not inside the repository compound
				continue;
			}
			if (originalRepositoryAttr.equals(originalRepositoryMetaData.getIdAttribute())
					&& !originalRepositoryAttr.isVisible())
			{
				// hidden id attribute of the source repository
				continue;
			}

			DefaultAttributeMetaData attributePartMetaData = copyAndRename(originalRepositoryAttr,
					getMergedAttributeName(repository, originalRepositoryAttr.getName()),
					getMergedAttributeLabel(repository, originalRepositoryAttr.getLabel()));
			if (originalRepositoryAttr.getDataType().getEnumType() == MolgenisFieldTypes.FieldTypeEnum.COMPOUND)
			{
				addCompoundAttributeParts(repository, originalRepositoryAttr, attributePartMetaData);
			}
			attributeParts.add(attributePartMetaData);
		}
		repositoryCompoundAttribute.setAttributesMetaData(attributeParts);
		mergedMetaData.addAttributeMetaData(repositoryCompoundAttribute);
	}

	/**
	 * Recursively add renamed copies of all the attribute parts of a compound attribute to its renamed copy.
	 *
	 * @param repository
	 *            repository the compound attribute originates from, used for the name prefix and label suffix
	 * @param originalRepositoryAttributeMetaData
	 *            the original compound attribute whose parts are copied
	 * @param attributePartMetaData
	 *            the renamed copy that receives the copied parts
	 */
	private void addCompoundAttributeParts(Repository repository, AttributeMetaData originalRepositoryAttributeMetaData,
			DefaultAttributeMetaData attributePartMetaData)
	{
		List<AttributeMetaData> subAttributeParts = new ArrayList<AttributeMetaData>();
		for (AttributeMetaData originalRepositorySubAttributeMetaData : originalRepositoryAttributeMetaData
				.getAttributeParts())
		{
			// label the copy after the sub attribute itself, not after its parent compound
			DefaultAttributeMetaData subAttributePartMetaData = copyAndRename(originalRepositorySubAttributeMetaData,
					getMergedAttributeName(repository, originalRepositorySubAttributeMetaData.getName()),
					getMergedAttributeLabel(repository, originalRepositorySubAttributeMetaData.getLabel()));
			if (subAttributePartMetaData.getDataType().getEnumType() == MolgenisFieldTypes.FieldTypeEnum.COMPOUND)
			{
				addCompoundAttributeParts(repository, originalRepositorySubAttributeMetaData, subAttributePartMetaData);
			}
			subAttributeParts.add(subAttributePartMetaData);
		}
		attributePartMetaData.setAttributesMetaData(subAttributeParts);
	}

	/**
	 * Check if a specific attribute name is present, ignoring case, in a list of AttributeMetaData.
	 */
	private boolean containsIgnoreCase(String input, List<AttributeMetaData> list)
	{
		for (AttributeMetaData attributeMetaData : list)
		{
			if (input.equalsIgnoreCase(attributeMetaData.getName()))
			{
				return true;
			}
		}
		return false;
	}

	/**
	 * Create a name for an attribute based on the attribute name in the original repository and the original repository
	 * name itself: {@code <repositoryName>_<attributeName>}.
	 */
	private String getMergedAttributeName(Repository repository, String attributeName)
	{
		return repository.getName() + "_" + attributeName;
	}

	/**
	 * Create a label for an attribute based on the attribute label in the original repository and the original
	 * repository name itself: {@code <attributeLabel>(<repositoryName>)}.
	 */
	private String getMergedAttributeLabel(Repository repository, String attributeLabel)
	{
		return attributeLabel + "(" + repository.getName() + ")";
	}

	/**
	 * Create a new attribute with the given name and label and the data type, description, default value, referenced
	 * entity, visibility, uniqueness, aggregateability and range of the given attribute.
	 * <p>
	 * The copy is always nillable and never read-only, regardless of the original.
	 */
	private DefaultAttributeMetaData copyAndRename(AttributeMetaData attributeMetaData, String name, String label)
	{
		DefaultAttributeMetaData result = new DefaultAttributeMetaData(name,
				attributeMetaData.getDataType().getEnumType());
		result.setDescription(attributeMetaData.getDescription());
		result.setNillable(true);// We got a problem if a attr is required in one entitymeta and missing in another
		result.setReadOnly(false);
		result.setDefaultValue(attributeMetaData.getDefaultValue());
		result.setRefEntity(attributeMetaData.getRefEntity());
		result.setLabel(label);
		result.setVisible(attributeMetaData.isVisible());
		result.setUnique(attributeMetaData.isUnique());
		result.setAggregateable(attributeMetaData.isAggregateable());
		result.setRange(attributeMetaData.getRange());

		return result;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy