All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.terrier.indexing.SimpleMedlineXMLCollection Maven / Gradle / Ivy

The newest version!
/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is SimpleMedlineXMLCollection.java.
 *
 * The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Ben He  (original author) 
 */
package org.terrier.indexing;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.List;

import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.Files;

/** Initial implementation of a class that generates a Collection with Documents from a 
  * series of XML files in the Medline format. It process a limited number of documents 
  * in an XML file to avoid OutOfMemory problem in case the XML file is too large.

* Properties: *

    *
  • lowercase - lower case all terms obtained. Highly recommended.
  • *
  • indexing.simplexmlcollection.reformxml - will try to reform broken & entities.
  • *
  • xml.doc.buffer.size - The maximum number of documents to process per interation.
  • *
* @author Ben He */ public class SimpleMedlineXMLCollection extends SimpleXMLCollection { /** The BufferedReader of the XML file to be processed. */ private BufferedReader br = null; /** The name of the currently processed XML file. */ private String currentFilename = null; /** The number of documents processed in the current XML file. */ protected int currentFileDocCounter = 0; /** The tag of documents in the XML files. */ public final String docTag = " files, String ignored1, String BlacklistSpecFilename, String ignored2) { super(files, ignored1, BlacklistSpecFilename, ignored2); } /** * Parse through up to a limited number of documents in the XML file. The limit is * specified by property xml.doc.buffer.size. */ protected boolean openNextFile() { if (FilesToProcess.size() == 0&&br==null) return false; if (br == null) { currentFilename = (String)FilesToProcess.remove(0); try{ br = Files.openFileReader(currentFilename); currentFileDocCounter=0; }catch(IOException ioe){ logger.error("Could not open next file", ioe); throw new RuntimeException(ioe); } } String filename = currentFilename+".tmp.gz"; BufferedWriter bw = null; int docCounter = 0; int foundBefore = currentFileDocCounter; try{ bw = (BufferedWriter)Files.writeFileWriter(filename); // create the temporiary file bw.write(fileTag+EOL); String str = null; // read until the first doc while ((str=br.readLine())!=null){ str = str.trim(); if (str.startsWith(docTag)){ bw.write(str+EOL); break; } } String currentStr = str; if (str == null) currentStr = ""; while ((str=br.readLine())!=null){ currentStr = str.trim(); if (currentStr.startsWith(docEndTag)){ // write the line, count++ bw.write(currentStr+EOL); docCounter++; currentFileDocCounter++; if (docCounter == NUMBER_OF_DOCS_IN_BUFFER){ break; } }else{ bw.write(currentStr+EOL); } } // set br = null if end of file if (str==null) br = null; if (!currentStr.startsWith(fileEndTag)) bw.write(fileEndTag); bw.close(); bw = null; }catch(IOException ioe){ logger.warn("Problem reading", ioe); } if(logger.isDebugEnabled()){ logger.debug("Processing file "+currentFilename+" for docs "+foundBefore+" - "+currentFileDocCounter); } try{ if(bReformXML) { //this replaces all & with & /* NB: This operation MAY be dangerous, as it MAY disrupt the encoding * of strings in the document while copying * Use at your own discretion, and test thoroughly. * TODO check */ if(logger.isDebugEnabled()){ logger.debug("Copying xml to temporary file"); } File temp = java.nio.file.Files.createTempFile("simpleMedlineXMLcollection", ".xml").toFile(); Files.copyFile(new File(filename), temp); //if(logger.isDebugEnabled()){ //logger.debug("parsing "+temp.getAbsoluteFile()); //} xmlDoc = dBuilder.parse(temp); if (! temp.delete()) logger.debug("Problem delete temp file"); } else { xmlDoc = dBuilder.parse(Files.openFileStream(filename)); } } catch (org.xml.sax.SAXException saxe) { logger.error("WARNING: Error parsing XML file "+ filename+ " : ", saxe); return openNextFile(); //bad: Recursion } catch (IOException ioe) { logger.error("WARNING: Error opening XML file "+ filename+ " : ",ioe); return openNextFile(); //bad: recursion } if (DocumentTags) { findDocumentElement(xmlDoc); } else { Documents.addLast(new XMLDocument(xmlDoc)); } if(logger.isInfoEnabled()){ logger.info("Found "+Documents.size() + " more documents in "+currentFilename); } xmlDoc = null; // remove the temporiary file if (!(new File(filename)).delete()){ logger.error("Unable to delete file "+filename); } filename = null; return true; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy