All downloads are free. The search and download functionality uses the official Maven repository.

GNormPluslib.BioCDoc Maven / Gradle / Ivy

package GNormPluslib;

import com.pengyifan.bioc.*;
import com.pengyifan.bioc.io.BioCCollectionReader;
import com.pengyifan.bioc.io.BioCCollectionWriter;
import com.pengyifan.bioc.io.BioCDocumentWriter;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.time.LocalDate;
import java.time.ZoneId;
import javax.xml.stream.XMLStreamException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BioCDoc {
	public ArrayList PMIDs = new ArrayList();
	public ArrayList> PassageNames = new ArrayList();
	public ArrayList> PassageOffsets = new ArrayList();
	public ArrayList> PassageContexts = new ArrayList();
	public ArrayList>> Annotations = new ArrayList();

	/**
	 * Determines whether a file is a BioC XML collection or a PubTator text file.
	 *
	 * <p>The file is first read as a BioC collection. If that fails with an
	 * encoding, file-not-found or XML error, the file is re-read line by line
	 * and checked against these rules:
	 * <ul>
	 * <li>consecutive {@code id|type|text} lines must share the same id until an empty line is seen;</li>
	 * <li>lines containing a tab are not validated;</li>
	 * <li>an empty line must be preceded by at least one {@code id|type|text} line;</li>
	 * <li>the last {@code id|type|text} line must be followed by an empty line.</li>
	 * </ul>
	 *
	 * @param InputFile path of the file to inspect
	 * @return {@code "BioC"}, {@code "PubTator"}, or a message starting with {@code "[Error]: "}
	 * @throws IOException if the file cannot be opened or read
	 */
	public String BioCFormatCheck(String InputFile) throws IOException {
		// The stream is its own resource so it is closed even when the BioC reader's constructor fails.
		try (FileInputStream bioCStream = new FileInputStream(InputFile);
				BioCCollectionReader bioCCollectionReader = new BioCCollectionReader(new InputStreamReader(bioCStream, "UTF-8"))) {
			bioCCollectionReader.readCollection();
		} catch (UnsupportedEncodingException | FileNotFoundException | XMLStreamException e) {
			// Not readable as BioC XML: fall back to the PubTator line checks.
			try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"))) {
				String line;
				String status = "";
				String Pmid = "";
				boolean tiabs = false;
				Pattern patt = Pattern.compile("^([^\\|\\t]+)\\|([^\\|\\t]+)\\|(.*)$");
				while ((line = br.readLine()) != null) {
					Matcher mat = patt.matcher(line);
					if (mat.find()) {
						if (Pmid.equals("")) {
							Pmid = mat.group(1);
						} else if (!Pmid.equals(mat.group(1))) {
							return "[Error]: " + InputFile + " - A blank is needed between " + Pmid + " and " + mat.group(1) + ".";
						}
						status = "tiabs";
						tiabs = true;
					} else if (line.contains("\t")) {
						// Tab-separated line: nothing is validated here.
					} else if (line.length() == 0) {
						if (status.equals("")) {
							if (Pmid.equals("")) {
								return "[Error]: " + InputFile + " - It's neither BioC nor PubTator format. PMID is empty.";
							} else {
								return "[Error]: " + InputFile + " - A redundant blank is after " + Pmid + ".";
							}
						}
						Pmid = "";
						status = "";
					}
				}
				if (!tiabs) {
					return "[Error]: " + InputFile + " - It's neither BioC nor PubTator format.";
				}
				if (status.equals("")) {
					return "PubTator";
				}
				return "[Error]: " + InputFile + " - The last column missed a blank.";
			}
		}
		return "BioC";
	}

	/**
	 * Substitutions applied, in order, to PubTator passage text before the
	 * whitelist filter in {@link #foldPubTatorText(String)}. Each pair is
	 * {@code {from, to}}.
	 */
	private static final String[][] PUBTATOR_ASCII_FOLDS = {
		{"ω", "w"}, {"μ", "u"}, {"κ", "k"}, {"α", "a"}, {"γ", "g"},
		{"ɣ", "g"}, {"β", "b"}, {"×", "x"}, {"‑", "-"}, {"¹", "1"},
		{"²", "2"}, {"°", "o"}, {"ö", "o"}, {"é", "e"}, {"à", "a"},
		{"Á", "A"}, {"ε", "e"}, {"θ", "O"}, {"•", "."}, {"µ", "u"},
		{"λ", "r"}, {"⁺", "+"}, {"ν", "v"}, {"ï", "i"}, {"ã", "a"},
		{"≡", "="}, {"ó", "o"}, {"³", "3"}, {"〖", "["}, {"〗", "]"},
		{"Å", "A"}, {"ρ", "p"}, {"ü", "u"}, {"ɛ", "e"}, {"č", "c"},
		{"š", "s"}, {"ß", "b"}, {"═", "="}, {"£", "L"}, {"Ł", "L"},
		{"ƒ", "f"}, {"ä", "a"}, {"–", "-"}, {"⁻", "-"}, {"〈", "<"},
		{"〉", ">"}, {"χ", "X"}, {"Đ", "D"}, {"‰", "%"}, {"·", "."},
		{"→", ">"}, {"←", "<"}, {"ζ", "z"}, {"π", "p"}, {"τ", "t"},
		{"ξ", "X"}, {"η", "h"}, {"ø", "0"}, {"Δ", "D"}, {"∆", "D"},
		{"∑", "S"}, {"Ω", "O"}, {"δ", "d"}, {"σ", "s"}, {"Φ", "F"},
	};

	/**
	 * Converts a PubTator file into a BioC XML collection.
	 *
	 * <p>Lines containing {@code |} but no tab are read as {@code id|type|text}
	 * passages ({@code t} becomes {@code title}, {@code a} becomes
	 * {@code abstract}). Tab-separated lines with five or six columns are read
	 * as annotations. An empty line ends the current document. Text after the
	 * second {@code |} is kept whole, even if it contains further {@code |}
	 * characters, and a final document that is not followed by an empty line
	 * is still written.
	 *
	 * @param input  path of the PubTator file to read
	 * @param output path of the BioC XML file to write
	 * @throws IOException        if either file cannot be opened, read or written
	 * @throws XMLStreamException if the BioC writer fails
	 */
	public void PubTator2BioC(String input, String output) throws IOException, XMLStreamException {
		final BioCCollectionWriter bioCCollectionWriter = new BioCCollectionWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
		try {
			BioCCollection biocCollection = new BioCCollection();
			ZoneId zonedId = ZoneId.of("America/Montreal");
			LocalDate today = LocalDate.now(zonedId);
			biocCollection.setDate(today.toString());
			biocCollection.setKey("BioC.key");
			biocCollection.setSource("GNormPlus");
			ArrayList<String> ParagraphType = new ArrayList<>();
			ArrayList<String> ParagraphContent = new ArrayList<>();
			ArrayList<String> annotations = new ArrayList<>();
			String Pmid = "";
			try (BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(input), "UTF-8"))) {
				String line;
				while ((line = inputfile.readLine()) != null) {
					if (line.contains("|") && !line.contains("\t")) {
						// Limit 3 keeps any '|' inside the passage text instead of discarding the text.
						String[] str = line.split("\\|", 3);
						Pmid = str[0];
						String type = str[1];
						if (type.equals("t")) {
							type = "title";
						} else if (type.equals("a")) {
							type = "abstract";
						}
						ParagraphType.add(type);
						if (str.length == 3) {
							ParagraphContent.add(foldPubTatorText(str[2]));
						} else {
							ParagraphContent.add("- No text -");
						}
					} else if (line.contains("\t")) {
						String[] anno = line.split("\t");
						if (anno.length == 6 || anno.length == 5) {
							// Drop the leading document id column; keep start, end, mention, type[, id].
							annotations.add(String.join("\t", Arrays.copyOfRange(anno, 1, anno.length)));
						}
					} else if (line.length() == 0) {
						biocCollection.addDocument(buildPubTatorDocument(Pmid, ParagraphType, ParagraphContent, annotations));
						ParagraphType.clear();
						ParagraphContent.clear();
						annotations.clear();
					}
				}
			}
			// A final document without a trailing empty line would otherwise be lost.
			if (!ParagraphType.isEmpty()) {
				biocCollection.addDocument(buildPubTatorDocument(Pmid, ParagraphType, ParagraphContent, annotations));
			}
			bioCCollectionWriter.writeCollection(biocCollection);
		} finally {
			bioCCollectionWriter.close();
		}
	}

	/** Applies {@link #PUBTATOR_ASCII_FOLDS}, then replaces every character outside the ASCII whitelist with a space. */
	private static String foldPubTatorText(String text) {
		String txt = text;
		for (String[] fold : PUBTATOR_ASCII_FOLDS) {
			txt = txt.replace(fold[0], fold[1]);
		}
		return txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]", " ");
	}

	/**
	 * Builds one BioC document from collected passages and annotation records.
	 * Passage offsets are consecutive, each passage occupying its text length
	 * plus one; an annotation is attached to the passage whose range contains
	 * its start offset.
	 */
	private static BioCDocument buildPubTatorDocument(String pmid, List<String> types, List<String> texts, List<String> annotations) {
		BioCDocument biocDocument = new BioCDocument();
		biocDocument.setID(pmid);
		int passageStart = 0;
		for (int i = 0; i < types.size(); i++) {
			String text = texts.get(i);
			int passageEnd = passageStart + text.length() + 1;
			BioCPassage biocPassage = new BioCPassage();
			Map<String, String> Infons = new HashMap<>();
			Infons.put("type", types.get(i));
			biocPassage.setInfons(Infons);
			biocPassage.setText(text);
			biocPassage.setOffset(passageStart);
			for (String record : annotations) {
				String[] anno = record.split("\t");
				int annoStart = Integer.parseInt(anno[0]);
				if (annoStart >= passageStart && annoStart < passageEnd) {
					BioCAnnotation biocAnnotation = new BioCAnnotation();
					Map<String, String> AnnoInfons = new HashMap<>();
					if (anno.length == 5) {
						AnnoInfons.put("Identifier", anno[4]);
					}
					AnnoInfons.put("type", anno[3]);
					biocAnnotation.setInfons(AnnoInfons);
					BioCLocation location = new BioCLocation(annoStart, Integer.parseInt(anno[1]) - annoStart);
					biocAnnotation.setLocations(Set.of(location));
					biocAnnotation.setText(anno[2]);
					biocPassage.addAnnotation(biocAnnotation);
				}
			}
			biocDocument.addPassage(biocPassage);
			passageStart = passageEnd;
		}
		return biocDocument;
	}

	/**
	 * Converts a BioC XML collection into a PubTator file, writing each
	 * passage's BioC text.
	 *
	 * <p>For every document the passage lines are written first, followed by
	 * one tab-separated line per annotation and an empty line. Duplicate
	 * document IDs are reported on standard output and, once the output file
	 * has been closed, terminate the JVM via {@code System.exit(0)}.
	 *
	 * @param input  path of the BioC XML file to read
	 * @param output path of the PubTator file to write
	 * @throws IOException        if either file cannot be opened, read or written
	 * @throws XMLStreamException if the BioC reader fails
	 */
	public void BioC2PubTator(String input, String output) throws IOException, XMLStreamException {
		Set<String> pmidlist = new HashSet<>();
		boolean duplicate = false;
		try (BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
				BioCCollectionReader bioCCollectionReader = new BioCCollectionReader(new InputStreamReader(new FileInputStream(input), "UTF-8"))) {
			final Iterator<BioCDocument> docIt = bioCCollectionReader.readCollection().documentIterator();
			while (docIt.hasNext()) {
				BioCDocument document = docIt.next();
				String PMID = document.getID();
				if (!pmidlist.add(PMID)) {
					System.out.println("\nError: duplicate pmid-" + PMID);
					duplicate = true;
				}
				StringBuilder Anno = new StringBuilder();
				for (BioCPassage passage : document.getPassages()) {
					String passageType = passage.getInfon("type").get();
					if (passageType.equals("title")) {
						PubTatorOutputFormat.write(PMID + "|t|" + passage.getText().get() + "\n");
					} else if (passageType.equals("abstract")) {
						PubTatorOutputFormat.write(PMID + "|a|" + passage.getText().get() + "\n");
					} else {
						PubTatorOutputFormat.write(PMID + "|" + passageType + "|" + passage.getText().get() + "\n");
					}
					for (BioCAnnotation annotation : passage.getAnnotations()) {
						String Annotype = annotation.getInfon("type").get();
						String Annoid;
						if (Annotype.matches("(Gene|FamilyName|DomainMotif)")) {
							if (annotation.getInfons().containsKey("NCBI Gene")) {
								Annoid = formatGeneIdentifiers(annotation.getInfon("NCBI Gene").get());
							} else {
								// A missing Identifier yields null here, which is written out as the text "null".
								Annoid = annotation.getInfon("Identifier").orElse(null);
							}
						} else if (Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain") || Annotype.equals("CellLine")) {
							Annoid = taxonomyOrIdentifier(annotation);
						} else {
							Annoid = annotation.getInfon("Identifier").get();
						}
						appendAnnotationLine(Anno, PMID, annotation, Annotype, Annoid);
					}
				}
				PubTatorOutputFormat.write(Anno + "\n");
			}
		}
		if (duplicate) {
			System.exit(0);
		}
	}

	/**
	 * Converts a BioC XML collection into a PubTator file, writing title and
	 * abstract text taken from an original PubTator file instead of the BioC
	 * passage text.
	 *
	 * <p>The original file is read first; for each {@code id|type|text} line
	 * the complete text after the second {@code |} is remembered under its id
	 * and type (an {@code id|type} line without a text field is remembered as
	 * empty text). Passages of type {@code title}/{@code t} and
	 * {@code abstract}/{@code a} are then written with the remembered
	 * {@code t} and {@code a} text; all other passages are written with their
	 * BioC text. Duplicate document IDs are reported on standard output and,
	 * once the output file has been closed, terminate the JVM via
	 * {@code System.exit(0)}.
	 *
	 * @param original_input path of the original PubTator file
	 * @param input          path of the BioC XML file to read
	 * @param output         path of the PubTator file to write
	 * @throws IOException        if a file cannot be opened, read or written
	 * @throws XMLStreamException if the BioC reader fails
	 */
	public void BioC2PubTator(String original_input, String input, String output) throws IOException, XMLStreamException {
		HashMap<String, String> ParagraphContent = new HashMap<>();
		try (BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(original_input), "UTF-8"))) {
			String line;
			while ((line = inputfile.readLine()) != null) {
				if (line.contains("|") && !line.contains("\t")) {
					// Limit 3 keeps any '|' that occurs inside the text itself.
					String[] str = line.split("\\|", 3);
					String text = str.length >= 3 ? str[2] : "";
					ParagraphContent.put(str[0] + "\t" + str[1], text);
				}
			}
		}
		Set<String> pmidlist = new HashSet<>();
		boolean duplicate = false;
		try (BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
				BioCCollectionReader bioCCollectionReader = new BioCCollectionReader(new InputStreamReader(new FileInputStream(input), "UTF-8"))) {
			final Iterator<BioCDocument> docIt = bioCCollectionReader.readCollection().documentIterator();
			while (docIt.hasNext()) {
				BioCDocument document = docIt.next();
				String PMID = document.getID();
				if (!pmidlist.add(PMID)) {
					System.out.println("\nError: duplicate pmid-" + PMID);
					duplicate = true;
				}
				StringBuilder Anno = new StringBuilder();
				for (BioCPassage passage : document.getPassages()) {
					String passageType = passage.getInfon("type").get();
					if (passageType.equals("title") || passageType.equals("t")) {
						PubTatorOutputFormat.write(PMID + "|t|" + ParagraphContent.get(PMID + "\tt") + "\n");
					} else if (passageType.equals("abstract") || passageType.equals("a")) {
						PubTatorOutputFormat.write(PMID + "|a|" + ParagraphContent.get(PMID + "\ta") + "\n");
					} else {
						PubTatorOutputFormat.write(PMID + "|" + passageType + "|" + passage.getText().get() + "\n");
					}
					for (BioCAnnotation annotation : passage.getAnnotations()) {
						String Annotype = annotation.getInfon("type").get();
						String Annoid;
						if (Annotype.matches("(Gene|FamilyName|DomainMotif)")) {
							if (annotation.getInfons().containsKey("NCBI Gene")) {
								Annoid = formatGeneIdentifiers(annotation.getInfon("NCBI Gene").get());
							} else {
								Annoid = annotation.getInfon("Identifier").get();
							}
						} else if (Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain") || Annotype.equals("CellLine")) {
							Annoid = taxonomyOrIdentifier(annotation);
						} else if (annotation.getInfons().containsKey("Identifier")) {
							Annoid = annotation.getInfon("Identifier").get();
						} else {
							Annoid = "";
						}
						// Annoid is never null on any path above, so the identifier column is always written.
						appendAnnotationLine(Anno, PMID, annotation, Annotype, Annoid);
					}
				}
				PubTatorOutputFormat.write(Anno + "\n");
			}
		}
		if (duplicate) {
			System.exit(0);
		}
	}

	/**
	 * Rewrites a {@code ;}-separated list of gene ids, appending
	 * {@code (UniProt:...;Homoid:...)} to each id that has an entry in
	 * {@code GNormPlus.Normalization2Protein_hash} and/or
	 * {@code GNormPlus.HomologeneID_hash}. Ids without an entry are kept as is.
	 */
	private static String formatGeneIdentifiers(String geneIds) {
		StringBuilder formatted = new StringBuilder();
		for (String geneId : geneIds.split(";")) {
			String proteinid = "";
			String homoid = "";
			if (GNormPlus.Normalization2Protein_hash.containsKey(geneId)) {
				proteinid = GNormPlus.Normalization2Protein_hash.get(geneId);
			}
			if (GNormPlus.HomologeneID_hash.containsKey(geneId)) {
				homoid = GNormPlus.HomologeneID_hash.get(geneId);
			}
			if (formatted.length() > 0) {
				formatted.append(";");
			}
			formatted.append(geneId);
			if (!proteinid.equals("") || !homoid.equals("")) {
				formatted.append("(");
				if (!proteinid.equals("")) {
					formatted.append("UniProt:").append(proteinid);
				}
				if (!homoid.equals("")) {
					if (!proteinid.equals("")) {
						formatted.append(";");
					}
					formatted.append("Homoid:").append(homoid);
				}
				formatted.append(")");
			}
		}
		return formatted.toString();
	}

	/** Returns the {@code NCBI Taxonomy} infon when present, otherwise the {@code Identifier} infon. */
	private static String taxonomyOrIdentifier(BioCAnnotation annotation) {
		if (annotation.getInfons().containsKey("NCBI Taxonomy")) {
			return annotation.getInfon("NCBI Taxonomy").get();
		}
		return annotation.getInfon("Identifier").get();
	}

	/** Appends one PubTator annotation line: id, start, end, mention, type and identifier, tab-separated. */
	private static void appendAnnotationLine(StringBuilder out, String pmid, BioCAnnotation annotation, String type, String id) {
		int start = annotation.getTotalLocation().getOffset();
		int last = start + annotation.getTotalLocation().getLength();
		String AnnoMention = annotation.getText().get();
		out.append(pmid).append("\t").append(start).append("\t").append(last).append("\t")
				.append(AnnoMention).append("\t").append(type).append("\t").append(id).append("\n");
	}

	/**
	 * Substitutions applied, in order, to BioC passage text by
	 * {@link #normalizePassageText(String)}. Each pair is {@code {from, to}}.
	 */
	private static final String[][] PASSAGE_ASCII_FOLDS = {
		{"ω", "w"}, {"μ", "u"}, {"κ", "k"}, {"α", "a"}, {"γ", "g"},
		{"ɣ", "g"}, {"β", "b"}, {"×", "x"}, {"‑", "-"}, {"¹", "1"},
		{"²", "2"}, {"°", "o"}, {"ö", "o"}, {"é", "e"}, {"à", "a"},
		{"Á", "A"}, {"ε", "e"}, {"θ", "O"}, {"•", "."}, {"µ", "u"},
		{"λ", "r"}, {"⁺", "+"}, {"ν", "v"}, {"ï", "i"}, {"ã", "a"},
		{"≡", "="}, {"ó", "o"}, {"³", "3"}, {"〖", "["}, {"〗", "]"},
		{"Å", "A"}, {"ρ", "p"}, {"ü", "u"}, {"ɛ", "e"}, {"č", "c"},
		{"š", "s"}, {"ß", "b"}, {"═", "="}, {"£", "L"}, {"Ł", "L"},
		{"ƒ", "f"}, {"ä", "a"}, {"–", "-"}, {"⁻", "-"}, {"〈", "<"},
		{"〉", ">"}, {"χ", "X"}, {"Đ", "D"}, {"‰", "%"}, {"·", "."},
		{"→", ">"}, {"←", "<"}, {"ζ", "z"}, {"π", "p"}, {"τ", "t"},
		{"ξ", "X"}, {"η", "h"}, {"ø", "0"}, {"Δ", "D"}, {"∆", "D"},
		{"∑", "S"}, {"Ω", "O"}, {"δ", "d"}, {"σ", "s"}, {"Φ", "F"},
	};

	/**
	 * Reads a BioC XML collection and appends, for every document, its ID,
	 * passage types, passage offsets and normalized passage texts to
	 * {@code PMIDs}, {@code PassageNames}, {@code PassageOffsets} and
	 * {@code PassageContexts}. An empty annotation list is appended to
	 * {@code Annotations} for each passage; annotations in the file are ignored.
	 *
	 * @param input path of the BioC XML file to read
	 * @throws IOException        if the file cannot be opened or read
	 * @throws XMLStreamException if the BioC reader fails
	 */
	public void BioCReader(String input) throws IOException, XMLStreamException {
		readBioCPassages(input, false);
	}

	/**
	 * Same as {@link #BioCReader(String)}, but each passage's annotations are
	 * also stored in {@code Annotations} as tab-separated records of start,
	 * end, mention, type and, when present, identifier. Start and end are
	 * relative to the passage offset.
	 *
	 * @param input path of the BioC XML file to read
	 * @throws IOException        if the file cannot be opened or read
	 * @throws XMLStreamException if the BioC reader fails
	 */
	public void BioCReaderWithAnnotation(String input) throws IOException, XMLStreamException {
		readBioCPassages(input, true);
	}

	/** Shared implementation of the two reader methods; {@code withAnnotations} selects whether annotations are kept. */
	private void readBioCPassages(String input, boolean withAnnotations) throws IOException, XMLStreamException {
		try (BioCCollectionReader bioCCollectionReader = new BioCCollectionReader(new InputStreamReader(new FileInputStream(input), "UTF-8"))) {
			final Iterator<BioCDocument> docIt = bioCCollectionReader.readCollection().documentIterator();
			while (docIt.hasNext()) {
				BioCDocument document = docIt.next();
				PMIDs.add(document.getID());
				ArrayList<String> PassageName = new ArrayList<>();
				ArrayList<Integer> PassageOffset = new ArrayList<>();
				ArrayList<String> PassageContext = new ArrayList<>();
				ArrayList<ArrayList<String>> AnnotationInPMID = new ArrayList<>();
				for (BioCPassage passage : document.getPassages()) {
					PassageName.add(passage.getInfon("type").get());
					PassageContext.add(normalizePassageText(passage.getText().get()));
					PassageOffset.add(passage.getOffset());
					ArrayList<String> AnnotationInPassage = new ArrayList<>();
					if (withAnnotations) {
						for (BioCAnnotation Anno : passage.getAnnotations()) {
							int start = Anno.getTotalLocation().getOffset() - passage.getOffset();
							int last = start + Anno.getTotalLocation().getLength();
							String AnnoMention = Anno.getText().get();
							String Annotype = Anno.getInfon("type").get();
							String Annoid = Anno.getInfon("Identifier").orElse(null);
							// An absent identifier, or the literal text "null", is stored as a four-column record.
							if (Annoid == null || Annoid.equals("null")) {
								AnnotationInPassage.add(start + "\t" + last + "\t" + AnnoMention + "\t" + Annotype);
							} else {
								AnnotationInPassage.add(start + "\t" + last + "\t" + AnnoMention + "\t" + Annotype + "\t" + Annoid);
							}
						}
					}
					AnnotationInPMID.add(AnnotationInPassage);
				}
				PassageNames.add(PassageName);
				PassageContexts.add(PassageContext);
				PassageOffsets.add(PassageOffset);
				Annotations.add(AnnotationInPMID);
			}
		}
	}

	/**
	 * Returns the text stored for a passage: {@code "-notext-"} for empty or
	 * space-only text, a run of {@code @} of the same length for other text
	 * made only of tabs and spaces, and otherwise the text with
	 * {@link #PASSAGE_ASCII_FOLDS} applied.
	 */
	private static String normalizePassageText(String raw) {
		if (raw.equals("") || raw.matches("[ ]+")) {
			return "-notext-";
		}
		if (raw.matches("[\t ]+")) {
			return raw.replaceAll(".", "@");
		}
		String txt = raw;
		for (String[] fold : PASSAGE_ASCII_FOLDS) {
			txt = txt.replace(fold[0], fold[1]);
		}
		return txt;
	}

	public void BioCOutput(String input, String output, ArrayList>> Annotations, boolean Final, boolean RemovePreviousAnno) throws IOException, XMLStreamException {
		boolean ShowUnNormalizedMention = false;
		if (GNormPlus.setup_hash.containsKey("ShowUnNormalizedMention") && GNormPlus.setup_hash.get("ShowUnNormalizedMention").equals("True")) {
			ShowUnNormalizedMention = true;
		}
		final BioCDocumentWriter BioCOutputFormat = new BioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
		BioCCollection biocCollection_input = new BioCCollection();
		BioCCollection biocCollection_output = new BioCCollection();
		biocCollection_input = new BioCCollectionReader(new InputStreamReader(new FileInputStream(input), "UTF-8")).readCollection();
		BioCOutputFormat.writeBeginCollectionInfo(biocCollection_input);
		int i = 0;
		final Iterator docIt = biocCollection_input.documentIterator();
		while (docIt.hasNext()) {
			BioCDocument document_output = new BioCDocument();
			final BioCDocument document_input = docIt.next();
			String PMID = document_input.getID();
			document_output.setID(PMID);
			int annotation_count = 0;
			int j = 0;
			for (BioCPassage passage_input : document_input.getPassages()) {
				BioCPassage passage_output = passage_input;
				if (RemovePreviousAnno == true) {
					passage_output.clearAnnotations();
				} else {
					for (BioCAnnotation annotation : passage_output.getAnnotations()) {
						annotation.setID("" + annotation_count);
						annotation_count++;
					}
				}
				int passage_Offset = passage_input.getOffset();
				String passage_Text = passage_input.getText().get();
				ArrayList AnnotationInPassage = new ArrayList();
				if (Annotations.size() > i && Annotations.get(i).size() > j) {
					for (int a = 0; a < Annotations.get(i).get(j).size(); a++) {
						String Anno[] = Annotations.get(i).get(j).get(a).split("\\t");
						int start = Integer.parseInt(Anno[0]);
						int last = Integer.parseInt(Anno[1]);
						boolean found = false;
						if (passage_Text.length() > last) {
							String mention = Anno[2];
							if (Final == true && passage_Text.length() >= last) {
								mention = passage_Text.substring(start, last);
							}
							if (mention.matches(".*\t.*")) {
								Anno[3] = Anno[4];
								if (Anno.length >= 6) {
									Anno[4] = Anno[5];
								}
							}
							String type = Anno[3];
							String id = "";
							if (Anno.length >= 5) {
								id = Anno[4];
							}
							if (Final == true) {
								for (int b = 0; b < AnnotationInPassage.size(); b++) {
									String Annob[] = AnnotationInPassage.get(b).split("\\t");
									int startb = Integer.parseInt(Annob[0]);
									int lastb = Integer.parseInt(Annob[1]);
									String mentionb = Annob[2];
									if (Final == true && passage_Text.length() >= lastb) {
										mentionb = passage_Text.substring(startb, lastb);
									}
									if (mentionb.matches(".*\t.*")) {
										Annob[3] = Annob[4];
										if (Annob.length >= 6) {
											Annob[4] = Annob[5];
										}
									}
									String typeb = Annob[3];
									String idb = "";
									if (Annob.length >= 5) {
										idb = Annob[4];
									}
									if (start == startb && last == lastb && type.equals(typeb)) {
										found = true;
										if (id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!idb.equals(""))) {
										} else if (idb.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+")) && (!id.equals(""))) {
											AnnotationInPassage.set(b, start + "\t" + last + "\t" + mention + "\t" + type + "\t" + id);
										} else {
											if (id.equals("")) {
											} else {
												AnnotationInPassage.set(b, start + "\t" + last + "\t" + mention + "\t" + type + "\t" + idb + ";" + id);
											}
										}
										break;
									}
								}
							}
						}
						if (found == false) {
							AnnotationInPassage.add(Annotations.get(i).get(j).get(a));
						}
					}
				}
				for (int a = 0; a < AnnotationInPassage.size(); a++) {
					String Anno[] = AnnotationInPassage.get(a).split("\\t");
					HashMap id_hash = new HashMap();
					if (Anno.length >= 5) {
						int start = Integer.parseInt(Anno[0]);
						int last = Integer.parseInt(Anno[1]);
						String mention = Anno[2];
						if (Final == true && passage_Text.length() >= last) {
							mention = passage_Text.substring(start, last);
						}
						if (mention.matches(".*\t.*")) {
							Anno[3] = Anno[4];
							if (Anno.length >= 6) {
								Anno[4] = Anno[5];
							}
						}
						String ids = Anno[4];
						String idlist[] = ids.split(",");
						for (int b = 0; b < idlist.length; b++) {
							id_hash.put(idlist[b], "");
						}
						ids = "";
						for (String id : id_hash.keySet()) {
							if (ids.equals("")) {
								ids = id;
							} else {
								ids = ids + ";" + id;
							}
						}
						AnnotationInPassage.set(a, Anno[0] + "\t" + Anno[1] + "\t" + Anno[2] + "\t" + Anno[3] + "\t" + ids);
					}
				}
				for (int a = 0; a < AnnotationInPassage.size(); a++) {
					String Anno[] = AnnotationInPassage.get(a).split("\\t");
					int start = Integer.parseInt(Anno[0]);
					int last = Integer.parseInt(Anno[1]);
					if (passage_Text.length() > last) {
						String mention = Anno[2];
						if (Final == true && passage_Text.length() >= last) {
							mention = passage_Text.substring(start, last);
						}
						if (mention.matches(".*\t.*")) {
							Anno[3] = Anno[4];
							if (Anno.length >= 6) {
								Anno[4] = Anno[5];
							}
						}
						if (Anno.length < 4)
							throw new IllegalStateException("Document with ID " + PMID + " has annotation \"" + Arrays.toString(Anno) + "\" which is too short: A minimal length of 4 is expected.");
						String type = Anno[3];
						if (type.equals("GeneID")) {
							type = "Gene";
						}
						BioCAnnotation biocAnnotation = new BioCAnnotation();
						Map AnnoInfons = new HashMap();
						AnnoInfons.put("type", type);
						if (Anno.length >= 5) {
							String identifier = Anno[4];
							if (Final == true && ShowUnNormalizedMention == false) {
								if (type.matches("(FamilyName|Domain|Gene)")) {
									Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
									Matcher mtmp0 = ptmp0.matcher(identifier);
									Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
									Matcher mtmp1 = ptmp1.matcher(identifier);
									Pattern ptmp2 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)$");
									Matcher mtmp2 = ptmp2.matcher(identifier);
									Pattern ptmp3 = Pattern.compile("^Homo\\:([0-9]+)$");
									Matcher mtmp3 = ptmp3.matcher(identifier);
									if (mtmp0.find()) {
										String Method_SA = mtmp0.group(1);
										String TaxonomyID = mtmp0.group(2);
										String NCBIGeneID = mtmp0.group(3);
										if (GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID)) {
											AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
										}
										if (GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID)) {
											AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
										}
										AnnoInfons.put("NCBI Gene", NCBIGeneID);
									} else if (mtmp1.find()) {
										String Method_SA = mtmp1.group(1);
										String TaxonomyID = mtmp1.group(2);
										String NCBIGeneID = mtmp1.group(3);
										String HomoID = mtmp1.group(4);
										if (GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID)) {
											AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
										}
										if (GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID)) {
											AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
										}
										AnnoInfons.put("NCBI Gene", NCBIGeneID);
									} else if (mtmp2.find()) {
										String Method_SA = mtmp2.group(1);
										String TaxonomyID = mtmp2.group(2);
										AnnoInfons.put("FocusSpecies", "NCBITaxonomyID:" + TaxonomyID);
									} else if (mtmp3.find()) {
										String Method_SA = mtmp3.group(1);
										String HomoID = mtmp3.group(2);
										AnnoInfons.put("NCBI Homologene", HomoID);
									} else {
										String identifiers[] = identifier.split(";");
										if (identifiers.length > 1) {
											ArrayList identifierSTR = new ArrayList();
											ArrayList ProteinidSTR = new ArrayList();
											ArrayList HomoidSTR = new ArrayList();
											for (int idi = 0; idi < identifiers.length; idi++) {
												Pattern ptmp4 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
												Matcher mtmp4 = ptmp4.matcher(identifiers[idi]);
												Pattern ptmp5 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
												Matcher mtmp5 = ptmp5.matcher(identifiers[idi]);
												if (mtmp4.find()) {
													String Method_SA = mtmp4.group(1);
													String TaxonomyID = mtmp4.group(2);
													String NCBIGeneID = mtmp4.group(3);
													String HomoID = mtmp4.group(4);
													if (!identifierSTR.contains(NCBIGeneID)) {
														identifierSTR.add(NCBIGeneID);
													}
													if (GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID)) {
														if (!ProteinidSTR.contains(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))) {
															ProteinidSTR.add(GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
														}
													}
													if (GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID)) {
														if (!HomoidSTR.contains(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))) {
															HomoidSTR.add(GNormPlus.HomologeneID_hash.get(NCBIGeneID));
														}
													}
												} else if (mtmp5.find()) {
													String Method_SA = mtmp5.group(1);
													String TaxonomyID = mtmp5.group(2);
													String NCBIGeneID = mtmp5.group(3);
													if (!identifierSTR.contains(NCBIGeneID)) {
														identifierSTR.add(NCBIGeneID);
													}
												}
											}
											String idSTR = "";
											for (int x = 0; x < identifierSTR.size(); x++) {
												if (idSTR.equals("")) {
													idSTR = identifierSTR.get(x);
												} else {
													idSTR = idSTR + ";" + identifierSTR.get(x);
												}
											}
											AnnoInfons.put("NCBI Gene", idSTR);
											String pidSTR = "";
											for (int x = 0; x < ProteinidSTR.size(); x++) {
												if (pidSTR.equals("")) {
													pidSTR = ProteinidSTR.get(x);
												} else {
													pidSTR = pidSTR + ";" + ProteinidSTR.get(x);
												}
											}
											if (!pidSTR.equals("")) {
												AnnoInfons.put("UniProt", pidSTR);
											}
											String hidSTR = "";
											for (int x = 0; x < HomoidSTR.size(); x++) {
												if (hidSTR.equals("")) {
													hidSTR = HomoidSTR.get(x);
												} else {
													hidSTR = hidSTR + ";" + HomoidSTR.get(x);
												}
											}
											if (!hidSTR.equals("")) {
												AnnoInfons.put("NCBI Homologene", hidSTR);
											}
										}
									}
								} else if (type.matches("(Species|Genus|Strain)")) {
									AnnoInfons.put("type", type);
									AnnoInfons.put("NCBI Taxonomy", identifier);
								} else if (type.matches("Cell")) {
									AnnoInfons.put("type", "CellLine");
									AnnoInfons.put("NCBI Taxonomy", identifier);
								} else {
									AnnoInfons.put("Identifier", identifier);
								}
							} else {
								AnnoInfons.put("Identifier", identifier);
							}
						}
						biocAnnotation.setInfons(AnnoInfons);
						BioCLocation location = new BioCLocation(start + passage_Offset, last - start);
						biocAnnotation.setLocations(Set.of(location));
						biocAnnotation.setText(mention);
						biocAnnotation.setID("" + annotation_count);
						annotation_count++;
						if (Final == true) {
							// "FamilyName" added by Erik Faessler, JULIE Lab: Output FamilyNames
							if (AnnoInfons.containsKey("Identifier") || AnnoInfons.containsKey("NCBI Homologene") || AnnoInfons.containsKey("NCBI Gene") || AnnoInfons.containsKey("NCBI Taxonomy") || type.equals("FamilyName")) {
								passage_output.addAnnotation(biocAnnotation);
							}
						} else {
							passage_output.addAnnotation(biocAnnotation);
						}
					}
				}
				document_output.addPassage(passage_output);
				j++;
			}
			biocCollection_output.addDocument(document_output);
			BioCOutputFormat.writeDocument(document_output);
			i++;
		}
		BioCOutputFormat.close();
	}
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy