
// com.dell.doradus.olap.aggregate.DuplicationDetection Maven / Gradle / Ivy
// The newest version!
/*
* Copyright (C) 2014 Dell, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dell.doradus.olap.aggregate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.dell.doradus.common.CommonDefs;
import com.dell.doradus.common.TableDefinition;
import com.dell.doradus.olap.io.BSTR;
import com.dell.doradus.olap.io.FileDeletedException;
import com.dell.doradus.olap.merge.RestorableIxDoc;
import com.dell.doradus.search.FieldSet;
import com.dell.doradus.search.SearchResult;
import com.dell.doradus.search.SearchResultList;
import com.dell.doradus.search.util.HeapList;
import com.dell.doradus.utilities.Timer;
public class DuplicationDetection {
private static Logger LOG = LoggerFactory.getLogger("DuplicationDetection");
public static SearchResultList getDuplicateIDs(TableDefinition tableDef, List shards) {
// repeat if segment was merged
for(int i = 0; i < 5; i++) {
try {
return getDuplicateIDsInternal(tableDef, shards);
}catch(FileDeletedException ex) {
LOG.warn(ex.getMessage() + " - retrying: " + i);
continue;
}
}
throw new FileDeletedException("All retries to getDuplicateIDs failed");
//return getDuplicateIDsInternal(tableDef, dirs, shards);
}
public static SearchResultList getDuplicateIDsInternal(TableDefinition tableDef, List shards) {
Timer timer = new Timer();
LOG.debug("Find duplicate ids in {}/{}", tableDef.getAppDef().getAppName(), tableDef.getTableName());
int documentsCount = 0;
Map> result = new HashMap>();
List curShards = new ArrayList();
BSTR last_id = new BSTR();
last_id.length = -1;
HeapList heap = new HeapList<>(shards.size() - 1);
RestorableIxDoc current = null;
for(int i = 0; i < shards.size(); i++) {
current = new RestorableIxDoc(i, tableDef, shards.get(i));
current.next();
current = heap.AddEx(current);
}
while (current != null && current.id != null) {
documentsCount++;
if(!BSTR.isEqual(last_id, current.id)) {
if(curShards.size() > 1) {
result.put(last_id.toString(), new ArrayList(curShards));
}
last_id.set(current.id);
curShards.clear();
}
curShards.add(shards.get(current.segment));
current.next();
current = heap.AddEx(current);
}
if(curShards.size() > 1) {
result.put(last_id.toString(), new ArrayList(curShards));
}
SearchResultList res = new SearchResultList();
FieldSet fs = new FieldSet(tableDef);
res.documentsCount = documentsCount;
StringBuilder sb = new StringBuilder();
for(Map.Entry> elem : result.entrySet()) {
SearchResult r = new SearchResult();
r.fieldSet = fs;
r.scalars.put(CommonDefs.ID_FIELD, elem.getKey());
sb.setLength(0);
for(String shard : elem.getValue()) {
sb.append(shard);
sb.append(',');
}
sb.setLength(sb.length() - 1);
r.scalars.put("shards", sb.toString());
res.results.add(r);
}
LOG.debug("Found {} duplicate ids {}/{} merged in {}",
new Object[] { result.size(), tableDef.getAppDef().getAppName(), tableDef.getTableName(), timer });
return res;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy