All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.fbk.dkm.pikes.resources.reader.LKCollectionReader Maven / Gradle / Ivy

Go to download

A collection of Java classes for accessing and querying a number of NLP resources.

The newest version!

package eu.fbk.dkm.pikes.resources.reader;

import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import se.lth.cs.nlp.nlputils.core.Ax;
import se.lth.cs.nlp.nlputils.core.ListMap;

import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* SAX stuff. */

public class LKCollectionReader {

	private ArrayList textFiles = new ArrayList();
	private int nextFile = -1;

	private HashMap> annFileNames = new HashMap();

	private static final Pattern BASE_PAT = Pattern.compile("name=\"base\">(.*)");

	private static final Pattern ON_FILE_PAT = Pattern.compile("scope=\"(.*?)\"");
	private static final Pattern ON_FILES_PAT = Pattern.compile("on-files=\"(.*?)\"");

	private String workDir = null;

	private HashSet usedAnnotations; // unimplemented

	public LKCollectionReader(String dir) throws IOException {
		this(null, dir, null);
	}

	public LKCollectionReader(String dir, List fileList) throws IOException {
		this(null, dir, fileList);
	}

	private LKCollectionReader(Collection usedAnnotations, String dir,
							   List fileList) throws IOException {
		workDir = dir;
		if (usedAnnotations != null) {
			this.usedAnnotations = new HashSet(usedAnnotations);
		}

		if (fileList == null) {
			makeFileList(dir);
		}
		else {
			makeFileList(dir, fileList);
		}

		nextFile = 0;

		//System.out.println(textFiles);
		//System.out.println(annFileNames);

	}

	private void makeFileList(String dir) throws IOException {
		File df = new File(dir);
		if (!df.isDirectory()) {
			throw new IllegalArgumentException("Must specify a directory");
		}

		File[] list = df.listFiles();
		for (File f : list) {

			//Scanner sc = new Scanner(f);
			BufferedReader br = Ax.openFileReader(f.getAbsolutePath());
			String line = br.readLine();

			int count = 0;
			while (count < 3 && line != null) {
				//String line = sc.nextLine();
				count++;
				if (line.contains(" afns = annFileNames.get(base);
							if (afns == null) {
								afns = new ArrayList();
								annFileNames.put(base, afns);
								//System.out.println("Annotation file: " + f + "->" + base);
							}
							afns.add(f.getName());
							break;
						}
						line = br.readLine();
					}
					break;
				}
				line = br.readLine();
			}
			//sc.close();
			br.close();
		}
	}

	private void makeFileList(String dir, List listedFiles) throws IOException {
		ArrayList copy = new ArrayList(listedFiles);
		File df = new File(dir);
		if (!df.isDirectory()) {
			throw new IllegalArgumentException("Must specify a directory");
		}

		for (ListIterator iter = copy.listIterator(); iter.hasNext(); ) {
			String fn = iter.next();
			//if(fn.startsWith("database/docs/")) {
			//	fn = fn.replaceFirst("database/docs/", "");
			//}
			if (!fn.endsWith(".xml")) // TODO gz/bz2???
			{
				fn = fn + ".lktext.xml";
			}
			File f = new File(dir + File.separator + fn);
			if (!f.exists() && fn.startsWith(dir)) {
				fn = fn.substring(dir.length());
				f = new File(dir + File.separator + fn);
			}
			if (!f.exists()) {
				String fn2 = fn.replaceAll("/", "_");
				f = new File(dir + File.separator + fn2);
				if (!f.exists()) {
					throw new IllegalArgumentException("file " + fn + " does not exist");
				}
				fn = fn2;
			}
			iter.set(fn);
			textFiles.add(f);
		}

		// TODO should only include annotation for the selected files
		File[] list = df.listFiles();
		for (File f : list) {
			Scanner sc = new Scanner(f);
			int count = 0;
			while (count < 3 && sc.hasNextLine()) {
				String line = sc.nextLine();
				count++;
				if (line.contains(" afns = annFileNames.get(base);
								if (afns == null) {
									afns = new ArrayList();
									annFileNames.put(base, afns);
								}
								afns.add(f.getName());
							}
							break;
						}
					}
					break;
				}
			}
			sc.close();
		}
	}

	public boolean hasNext() {
		return (nextFile >= 0 && nextFile < textFiles.size());
		//return false;
	}

	public LKAnnotatedText next() {

		try {

			//System.out.println("getNext: current = " + textFiles.get(nextFile));

			// open input stream to file
			File file = (File) textFiles.get(nextFile);

			//System.out.println("its annotations: " + annFileNames.get(file.getName()));

			LKTextParserCallback tcb = null;

			XMLReader reader = makeXMLReader();
			tcb = new LKTextParserCallback();
			reader.setContentHandler(tcb);

			InputSource is = new InputSource(new FileInputStream(file));

			reader.parse(is);

			String text = tcb.getText();

//			String sourceFile = tcb.getMetaInfo().get("source");
//			System.out.println("source = " + sourceFile);

			ArrayList layers
					= readAnnotations(file.getName(),
					annFileNames.get(file.getName()));

			nextFile++;

			LKAnnotatedText out = new LKAnnotatedText();
			out.rawText = text;
			out.layers = layers;
			out.metaInfo = tcb.metaInfo;
			return out;

		} catch (Exception e) {
			throw new RuntimeException(e);
		}

	}

	LKAnnotationLayer getLayer(String provides, ArrayList ls) {
		for (LKAnnotationLayer l : ls) {
			if (l.provides.equals(provides)) {
				return l;
			}
		}
		return null;
	} 

	/* SAX stuff. */

	private XMLReader makeXMLReader()
			throws SAXException, ParserConfigurationException {

		javax.xml.parsers.SAXParserFactory saxParserFactory =
				javax.xml.parsers.SAXParserFactory.newInstance();

		final javax.xml.parsers.SAXParser saxParser
				= saxParserFactory.newSAXParser();

		final XMLReader parser = saxParser.getXMLReader();

		return parser;
	}

	private static class LKTextParserCallback extends DefaultHandler
			implements ContentHandler {

		private boolean insideText = false;
		private StringBuilder sb = new StringBuilder();
		private HashMap metaInfo = new HashMap();

		private boolean insideTag = false;
		private StringBuilder tag = null;
		private String currentTagName = null;

		public void startElement(String namespace, String localname,
								 String type, Attributes attributes) {
			if (type.equals("tag")) {
				String name = attributes.getValue("name");
				if (name == null) {
					throw new RuntimeException("no name for tag");
				}
				currentTagName = name;
				tag = new StringBuilder();
				insideTag = true;
			}
			else if (type.equals("text")) {
				insideText = true;
				//sb.clear();
			}
			else if (type.matches("lk-text|meta-info")) {
				// do nothing
			}
			else {
				throw new RuntimeException("illegal type: " + type);
			}

		}

		public void endElement(String namespace, String localname,
							   String type) {
			if (type.equals("text")) {
				insideText = false;
			}
			else if (type.equals("tag")) {
				insideTag = false;
				metaInfo.put(currentTagName, tag.toString());
			}
		}

		public void characters(char[] ch, int start, int len) {
			if (insideText) {
				String s = new String(ch, start, len);
				sb.append(s);
			}
			else if (insideTag) {
				String s = new String(ch, start, len);
				tag.append(s);
			}
		}

		String getText() {
			return sb.toString();
		}

		HashMap getMetaInfo() {
			return metaInfo;
		}

	}

	private ArrayList readAnnotations(String baseText, ArrayList files) throws IOException {
		if (files == null) {
			throw new RuntimeException("file list = null");
		}

		ListMap layerMap = new ListMap();

		ArrayList sorted = sortAnnFiles(files);

		ArrayList out = new ArrayList();

		if (sorted.get(0).equals(baseText)) {
			sorted.remove(0);
		}

		for (String file : sorted) {
			try {
				//System.out.println("Processing file " + file);
				XMLReader reader = makeXMLReader();
				LKAnnotationParserCallback acb
						= new LKAnnotationParserCallback(baseText, layerMap);
				reader.setContentHandler(acb);
				String fullFileName = workDir + File.separatorChar + file;
				InputSource is = new InputSource(fullFileName);
				reader.parse(is);
				layerMap.putAll(file, acb.getLayers());
				out.addAll(acb.getLayers());
			} catch (SAXException e) {
				throw new IOException(e);
			} catch (ParserConfigurationException e) {
				throw new IOException(e);
			}
		}

		return out;
	}

	private static class LKAnnotationParserCallback extends DefaultHandler
			implements ContentHandler {

		private String baseTextFile;

		private ListMap layerMap;
		private ArrayList layers = new ArrayList();
		private LKAnnotationLayer current;

		private boolean onTextFile = false;
		//private LKAnnotationLayer currentOn = null;
		private ArrayList currentScope = null;

		private boolean insideTag = false;
		private boolean insideEntity = false;
		private StringBuilder tag = null;
		private String currentTagName = null;

		private HashMap metaInfo = new HashMap();

		private LinkedList stack;

		LKAnnotationParserCallback(String baseTextFile,
								   ListMap layerMap) {
			this.baseTextFile = baseTextFile;
			this.layerMap = layerMap;
		}

		public void startElement(String namespace, String localname,
								 String type, Attributes attributes) {

			if (insideEntity) {
				DataElementNode parent = stack.getLast();
				DataElementNode n = new DataElementNode(type);
				for (int i = attributes.getLength() - 1; i >= 0; i--) {
					String k = attributes.getQName(i);
					String v = attributes.getValue(i);
					n.attributes.put(k, v);
				}
				parent.children.add(n);
				stack.add(n);
			}
			else if (type.equals("e")) {
				LKAnnotationEntity e = new LKAnnotationEntity();

				String on = attributes.getValue("on");
				String start = attributes.getValue("start");
				String end = attributes.getValue("end");
				String from = attributes.getValue("from");
				String to = attributes.getValue("to");

				if (on != null) {
					if (start != null) {
						throw new RuntimeException("on!=null => start=null");
					}
					if (end != null) {
						throw new RuntimeException("on!=null => end=null");
					}
					if (from != null) {
						throw new RuntimeException("on!=null => from=null");
					}
					if (to != null) {
						throw new RuntimeException("on!=null => to=null");
					}
				}

				if (from != null || to != null) {
					if (from == null) {
						throw new RuntimeException("to!=null => from!=null");
					}
					if (to == null) {
						throw new RuntimeException("from!=null => to!=null");
					}
					if (end != null) {
						throw new RuntimeException("from!=null => end=null");
					}
					if (start != null) {
						throw new RuntimeException("from!=null => start=null");
					}
				}
				if (start != null || end != null) {
					if (end == null) {
						throw new RuntimeException("start!=null => end!=null");
					}
					if (start == null) {
						throw new RuntimeException("end!=null => start!=null");
					}
				}

				if (onTextFile && start != null) {
					if (!start.startsWith("#")) {
						throw new RuntimeException("start must begin with #");
					}
					if (!end.startsWith("#")) {
						throw new RuntimeException("start must begin with #");
					}
					e.cstart = Integer.parseInt(start.substring(1));
					e.cend = Integer.parseInt(end.substring(1)) + 1;
				}
				else if (start != null) {
					LKAnnotationLayer[] l1 = new LKAnnotationLayer[1];
					int ix1 = dereferenceId(start, l1);
					LKAnnotationLayer[] l2 = new LKAnnotationLayer[1];
					int ix2 = dereferenceId(end, l2);
					if (l1[0] != l2[0]) {
						throw new RuntimeException("different layers in start-end");
					}
					e.referred = new ArrayList();
					for (int i = ix1; i <= ix2; i++) {
						e.referred.add(l1[0].entityList.get(i));
					}
				}
				else if (on != null) {
					String[] set = on.split("\\,\\s*");
					e.referred = new ArrayList();
					LKAnnotationLayer[] l = new LKAnnotationLayer[1];
					for (String s : set) {
						int ix = dereferenceId(s, l);
						e.referred.add(l[0].entityList.get(ix));
					}
				}
				else if (from != null) {
					LKAnnotationLayer[] l = new LKAnnotationLayer[1];
					int ix1 = dereferenceId(from, l);
					e.from = l[0].entityList.get(ix1);
					int ix2 = dereferenceId(to, l);
					e.to = l[0].entityList.get(ix2);
				}

				String id = attributes.getValue("id");
				if (id == null) {
					throw new RuntimeException("no id");
				}
				if (current.idToIndex.containsKey(id)) {
					throw new RuntimeException("id must be unique");
				}

				e.localURI = id;

				//e.label = attributes.getValue("l");

				current.idToIndex.put(id, current.entityList.size());
				current.entityList.add(e);

				insideEntity = true;
				stack = new LinkedList();
				DataElementNode n = new DataElementNode("__ROOT__");
				stack.add(n);
				e.data = n;

			}
			else if (type.equals("tag")) {
				String name = attributes.getValue("name");
				if (name == null) {
					throw new RuntimeException("no name for tag");
				}
				currentTagName = name;
				tag = new StringBuilder();
				insideTag = true;
			}
			else if (type.equals("annotation")) {
				current = new LKAnnotationLayer();
				String scopeFile = attributes.getValue("scope");
				String onFiles = attributes.getValue("on-files");
				if (onFiles != null) {
					throw new RuntimeException("on-files is unimplemented: currently, we can only handle annotation layers with scope");
				}

				if (scopeFile != null && !scopeFile.equals("")) {
					if (scopeFile.contains("lktext")) {
						currentScope = null;
					}
					else {
						currentScope = layerMap.get(scopeFile);
						if (currentScope == null) {
							throw new RuntimeException("scope not found: |" + scopeFile + "|");
						}
					}
				}
				else {
					currentScope = layers;
				}


				onTextFile = scopeFile != null && scopeFile.equals(baseTextFile);
				current.scopeFile = scopeFile;
				current.provides = attributes.getValue("provides");
				layers.add(current);
			}
			else if (type.matches("lk-annotation|meta-info")) {
				// do nothing
			}
			else {
				throw new RuntimeException("illegal type: " + type);
			}
		}

		private int dereferenceId(String ref, LKAnnotationLayer[] lout) {
			if (ref == null) {
				throw new IllegalArgumentException("null reference");
			}
			ref = ref.trim();
			int ix = ref.indexOf('#');
			LKAnnotationLayer l = null;
			String fileRef = null;
			String idRef = null;
			if (ix == -1) {
				//idRef = ref;
				throw new RuntimeException("No fragment identifier");
			}
			else {
				fileRef = ref.substring(0, ix);
				idRef = ref.substring(ix + 1);
			}

			ArrayList scope;
			if (fileRef == null || fileRef.equals("")) {
				scope = currentScope;
			}
			//else if(fileRef.equals("$"))
			//	scope = layers;
			else {
				//throw new RuntimeException("sorry, this type of file reference is still unimplemented");
				scope = layerMap.get(fileRef);
				if (scope == null) {
					throw new RuntimeException("scope not found: " + fileRef);
				}
			}
			for (LKAnnotationLayer ll : scope) {
				Integer llIx = ll.idToIndex.get(idRef);
				if (llIx != null) {
					lout[0] = ll;
					return llIx;
				}
			}
			throw new RuntimeException("entity " + idRef + " not found");
		}

		public void endElement(String namespace, String localname,
							   String type) {

			if (insideEntity) {
				if (stack.size() == 1) {
					insideEntity = false;
					// 
				}
				else {
					stack.removeLast();
				}
			}
			else if (type.equals("tag")) {
				insideTag = false;
				metaInfo.put(currentTagName, tag.toString());
			}
			else if (type.equals("annotation")) {
				current = null;
			}
		}

		public void characters(char[] ch, int start, int len) {
			if (insideEntity) {
				String s = new String(ch, start, len);
				DataTextNode n = new DataTextNode(s);
				stack.getLast().children.add(n);
			}
			else if (insideTag) {
				String s = new String(ch, start, len);
				tag.append(s);
			}
		}

		ArrayList getLayers() {
			return layers;
		}

		HashMap getMetaInfo() {
			return metaInfo;
		}

	}

	private ArrayList sortAnnFiles(ArrayList files) throws IOException {
		//System.out.println("Before sorting: files = " + files);
		ListMap dg = createAnnDepGraph(files);
		//System.out.println("dg = " + dg);
		ArrayList out = tsort(dg);
		//System.out.println("After sorting: files = " + out);
		return out;
	}

	private  ArrayList tsort(ListMap depGraph) {

		HashSet starts = new HashSet();
		for (T k : depGraph.keySet()) {
			starts.add(k);
		}

		for (T k : depGraph.keySet()) {
			starts.removeAll(depGraph.get(k));
		}

		if (starts.size() == 0) {
			throw new RuntimeException("cyclic or empty graph!");
		}

		LinkedList q = new LinkedList(starts);
		ArrayList out = new ArrayList();

		while (!q.isEmpty()) {
			T t = q.removeFirst();
			out.add(t);
			ArrayList sl = depGraph.get(t);
			if (sl != null) {
				starts.clear();
				starts.addAll(sl);
				for (T s : sl) {
					ArrayList sl2 = depGraph.get(s);
					if (sl2 != null) {
						starts.removeAll(sl2);
					}
				}
				q.addAll(starts);
			}
		}
		HashSet seen = new HashSet();
		for (Iterator it = out.iterator(); it.hasNext(); ) {
			T t = it.next();
			if (seen.contains(t)) {
				it.remove();
			}
			else {
				seen.add(t);
			}
		}
		return out;
	}

	private  ArrayList tsort_orig(ListMap depGraph) {
		HashSet starts = new HashSet();
		for (T k : depGraph.keySet()) {
			starts.add(k);
		}

		for (T k : depGraph.keySet()) {
			starts.removeAll(depGraph.get(k));
		}

		if (starts.size() == 0) {
			throw new RuntimeException("cyclic or empty graph!");
		}

		LinkedList q = new LinkedList(starts);
		ArrayList out = new ArrayList();

		while (!q.isEmpty()) {
			T t = q.removeFirst();
			out.add(t);
			ArrayList sl = depGraph.get(t);

			if (sl != null) {
				q.addAll(sl);
			}
		}
		HashSet seen = new HashSet();
		for (Iterator it = out.iterator(); it.hasNext(); ) {
			T t = it.next();
			if (seen.contains(t)) {
				it.remove();
			}
			else {
				seen.add(t);
			}
		}

		return out;
	}

	private ListMap createAnnDepGraph(ArrayList files) throws IOException {
		ListMap out = new ListMap();
		for (String fn : files) {
			String full = workDir + File.separatorChar + fn;
			BufferedReader br = new BufferedReader(new FileReader(full));
			String line = br.readLine();
			while (line != null) {
				line = line.trim();
				if (!line.startsWith("




© 2015 - 2025 Weber Informatics LLC | Privacy Policy