All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.journal.TEIDOMParser Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
* Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.journal;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Walks the DOM of a TEI XML document and copies selected parts of it
 * (language, title, authors, affiliations, addresses, abstract and keywords)
 * into a Tika {@link Metadata} object.
 *
 * <p>Elements that are absent from the document are skipped silently; the
 * only hard requirement is that the input is well-formed XML.</p>
 */
public class TEIDOMParser {

    public TEIDOMParser() {
    }

    /**
     * Parses the given TEI XML string and extracts metadata from it.
     *
     * @param source       the TEI XML document as a string; it is encoded as UTF-8 before parsing
     * @param parseContext the parse context handed to {@link XMLReaderUtils#buildDOM}
     * @return a new {@link Metadata} instance populated from the document
     * @throws TikaException declared for DOM construction via {@link XMLReaderUtils}
     * @throws SAXException  declared for DOM construction via {@link XMLReaderUtils}
     * @throws IOException   declared for DOM construction via {@link XMLReaderUtils}
     */
    public Metadata parse(String source, ParseContext parseContext)
            throws TikaException, SAXException, IOException {
        Document root = XMLReaderUtils.buildDOM(
                new ByteArrayInputStream(source.getBytes(StandardCharsets.UTF_8)), parseContext);

        Metadata metadata = new Metadata();
        createGrobidMetadata(source, root.getDocumentElement(), metadata);
        return metadata;
    }

    /**
     * Populates {@code metadata} from the document element. The "text" child supplies the
     * language; the "teiHeader" child supplies the file and profile descriptions. The static
     * entries are always added, even when {@code root} is null.
     */
    private void createGrobidMetadata(String source, Element root, Metadata metadata) {
        if (root != null) {
            NodeList children = root.getChildNodes();

            Node text = getFirstChild(children, "text");
            if (text != null) {
                parseText(text, metadata);
            }

            // A document without a teiHeader must not abort the whole extraction.
            Node teiHeader = getFirstChild(children, "teiHeader");
            if (teiHeader != null) {
                Node fileDesc = getFirstChild(teiHeader.getChildNodes(), "fileDesc");
                if (fileDesc != null) {
                    parseFileDesc(fileDesc, metadata);
                }
                Node profileDesc = getFirstChild(teiHeader.getChildNodes(), "profileDesc");
                if (profileDesc != null) {
                    parseProfileDesc(profileDesc, metadata);
                }
            }
        }
        addStaticMet(source, root, metadata);
    }

    /**
     * Adds the entries that do not depend on the document structure: the metadata class name
     * and the raw XML source. {@code obj} is currently unused; it used to feed a
     * "TEIJSONSource" entry that was dropped together with the json.org dependency.
     */
    private void addStaticMet(String source, Element obj, Metadata metadata) {
        metadata.add("Class", Metadata.class.getName());
        metadata.add("TEIXMLSource", source);
    }

    /** Records the {@code lang} attribute of the text element as "Language", if present. */
    private void parseText(Node text, Metadata metadata) {
        String lang = getFirstAttribute(text, "xml", "lang");
        if (lang != null) {
            metadata.add("Language", lang);
        }
    }

    /** Dispatches the "titleStmt" and "sourceDesc" children of a fileDesc element. */
    private void parseFileDesc(Node fileDesc, Metadata metadata) {
        Node titleStmt = getFirstChild(fileDesc.getChildNodes(), "titleStmt");
        if (titleStmt != null) {
            parseTitleStmt(titleStmt, metadata);
        }

        Node sourceDesc = getFirstChild(fileDesc.getChildNodes(), "sourceDesc");
        if (sourceDesc != null) {
            parseSourceDesc(sourceDesc, metadata);
        }
    }

    /** Records the text of the first "title" child as "Title". */
    private void parseTitleStmt(Node titleStmt, Metadata metadata) {
        Node title = getFirstChild(titleStmt.getChildNodes(), "title");
        if (title != null) {
            String titleText = title.getTextContent();
            if (titleText != null) {
                metadata.add("Title", titleText);
            }
        }
    }

    /** Descends into the first "biblStruct" child of a sourceDesc element. */
    private void parseSourceDesc(Node sourceDesc, Metadata metadata) {
        Node biblStruct = getFirstChild(sourceDesc.getChildNodes(), "biblStruct");
        if (biblStruct != null) {
            parseBiblStruct(biblStruct, metadata);
        }
    }

    /**
     * Reads every "author" under the "analytic" child and emits the "Address", "Affiliation",
     * "Authors" and "FullAffiliations" entries; adds an "Error" entry when there is no
     * analytic section.
     */
    private void parseBiblStruct(Node biblStruct, Metadata metadata) {
        Node analytic = getFirstChild(biblStruct.getChildNodes(), "analytic");
        if (analytic == null) {
            metadata.add("Error", "Unable to parse: no analytic section in JSON");
            return;
        }

        List<Node> authorNodes = getChildNodes(analytic.getChildNodes(), "author");
        List<Author> authorList = new ArrayList<>();
        for (Node authorNode : authorNodes) {
            parseAuthor(authorNode, authorList);
        }

        metadata.add("Address", getMetadataAddresses(authorList));
        metadata.add("Affiliation", getMetadataAffiliations(authorList));
        metadata.add("Authors", getMetadataAuthors(authorList));
        metadata.add("FullAffiliations", getMetadataFullAffiliations(authorList));
    }

    /**
     * Collects the distinct affiliations of all authors, in order of first appearance.
     * Distinctness is decided by {@link Affiliation#equals(Object)}.
     */
    private List<Affiliation> collectUniqueAffiliations(List<Author> authorList) {
        List<Affiliation> unique = new ArrayList<>();
        for (Author a : authorList) {
            for (Affiliation af : a.getAffiliations()) {
                if (!unique.contains(af)) {
                    unique.add(af);
                }
            }
        }
        return unique;
    }

    /**
     * Renders the distinct affiliations as {@code [aff1,aff2,...]} using
     * {@link Affiliation#toString()}; yields {@code []} when there are none.
     */
    private String getMetadataFullAffiliations(List<Author> authorList) {
        List<Affiliation> unique = collectUniqueAffiliations(authorList);
        StringBuilder metAffils = new StringBuilder("[");
        for (int i = 0; i < unique.size(); i++) {
            if (i > 0) {
                metAffils.append(",");
            }
            metAffils.append(unique.get(i).toString());
        }
        metAffils.append("]");
        return metAffils.toString();
    }

    /**
     * Renders each author as first, middle and surname followed by the comma-separated
     * 1-based indices of that author's affiliations, e.g.
     * "Chris A. Mattmann 1,2 Daniel J. Crichton 1 ".
     */
    private String getMetadataAuthors(List<Author> authorList) {
        List<Affiliation> unique = collectUniqueAffiliations(authorList);
        StringBuilder metAuthors = new StringBuilder();

        for (Author a : authorList) {
            metAuthors.append(printOrBlank(a.getFirstName()));
            metAuthors.append(printOrBlank(a.getMiddleName()));
            metAuthors.append(printOrBlank(a.getSurName()));

            StringBuilder affilBuilder = new StringBuilder();
            for (int idx = 0; idx < unique.size(); idx++) {
                if (a.getAffiliations().contains(unique.get(idx))) {
                    affilBuilder.append(idx + 1);
                    affilBuilder.append(",");
                }
            }
            // Drop the trailing comma left by the loop above.
            if (affilBuilder.length() > 0) {
                affilBuilder.deleteCharAt(affilBuilder.length() - 1);
            }

            metAuthors.append(affilBuilder);
            metAuthors.append(" ");
        }
        return metAuthors.toString();
    }

    /**
     * Renders the distinct affiliations as a numbered list separated by "; ", e.g.
     * "1 Jet Propulsion Laboratory California Institute of Technology; 2 Computer Science
     * Department University of Southern California".
     */
    private String getMetadataAffiliations(List<Author> authorList) {
        List<Affiliation> unique = collectUniqueAffiliations(authorList);
        StringBuilder metAffil = new StringBuilder();

        int count = 1;
        for (Affiliation a : unique) {
            metAffil.append(count);
            metAffil.append(" ");
            metAffil.append(a.getOrgName().toString());
            // OrgName.toString() ends with a space; remove it before the separator.
            metAffil.deleteCharAt(metAffil.length() - 1);
            metAffil.append("; ");
            count++;
        }

        // Strip the final "; " separator when at least one affiliation was written.
        if (count > 1) {
            metAffil.deleteCharAt(metAffil.length() - 1);
            metAffil.deleteCharAt(metAffil.length() - 1);
        }
        return metAffil.toString();
    }

    /**
     * Renders the distinct addresses, each followed by a space, e.g.
     * "Pasadena, CA 91109 USA Los Angeles, CA 90089 USA ".
     */
    private String getMetadataAddresses(List<Author> authorList) {
        List<Address> unique = new ArrayList<>();
        for (Author a : authorList) {
            for (Affiliation af : a.getAffiliations()) {
                if (!unique.contains(af.getAddress())) {
                    unique.add(af.getAddress());
                }
            }
        }

        StringBuilder metAddress = new StringBuilder();
        for (Address ad : unique) {
            metAddress.append(ad.toString());
            metAddress.append(" ");
        }
        return metAddress.toString();
    }

    /**
     * Builds an {@link Author} from an author element (name parts from "persName",
     * affiliations from "affiliation" children) and appends it to {@code authorList}.
     */
    private void parseAuthor(Node authorNode, List<Author> authorList) {
        Author author = new Author();

        Node persName = getFirstChild(authorNode.getChildNodes(), "persName");
        if (persName != null) {
            List<Node> forenames = getChildNodes(persName.getChildNodes(), "forename");
            for (Node forenameNode : forenames) {
                parseNamePart(forenameNode, author);
            }

            Node surnameNode = getFirstChild(persName.getChildNodes(), "surname");
            if (surnameNode != null) {
                String surnameContent = surnameNode.getTextContent();
                if (surnameContent != null) {
                    author.setSurName(surnameContent);
                }
            }
        }

        List<Node> affiliationNodes = getChildNodes(authorNode.getChildNodes(), "affiliation");
        for (Node affiliationNode : affiliationNodes) {
            parseOneAffiliation(affiliationNode, author);
        }

        authorList.add(author);
    }

    /** Stores a forename on the author according to its {@code type} ("first" or "middle"). */
    private void parseNamePart(Node namePart, Author author) {
        String type = getFirstAttribute(namePart, null, "type");
        String content = namePart.getTextContent();
        if (type == null || content == null) {
            return;
        }

        if (type.equals("first")) {
            author.setFirstName(content);
        }
        if (type.equals("middle")) {
            author.setMiddleName(content);
        }
    }

    /** Builds one {@link Affiliation} (address plus org names) and attaches it to the author. */
    private void parseOneAffiliation(Node affiliationNode, Author author) {
        Affiliation affiliation = new Affiliation();

        Node address = getFirstChild(affiliationNode.getChildNodes(), "address");
        if (address != null) {
            parseAddress(address, affiliation);
        }

        List<Node> orgNameNodes = getChildNodes(affiliationNode.getChildNodes(), "orgName");
        OrgName orgName = new OrgName();
        for (Node orgNameNode : orgNameNodes) {
            parseOrgName(orgNameNode, orgName);
        }
        affiliation.setOrgName(orgName);

        author.getAffiliations().add(affiliation);
    }

    /**
     * Reads "region", "postCode", "settlement" and "country" from an address element and
     * sets the resulting {@link Address} on the affiliation.
     */
    private void parseAddress(Node addressNode, Affiliation affiliation) {
        Address address = new Address();
        NodeList parts = addressNode.getChildNodes();

        Node region = getFirstChild(parts, "region");
        if (region != null && region.getTextContent() != null) {
            address.setRegion(region.getTextContent());
        }

        Node postCode = getFirstChild(parts, "postCode");
        if (postCode != null && postCode.getTextContent() != null) {
            address.setPostCode(postCode.getTextContent());
        }

        Node settlementNode = getFirstChild(parts, "settlement");
        if (settlementNode != null && settlementNode.getTextContent() != null) {
            address.setSettlment(settlementNode.getTextContent());
        }

        Node countryNode = getFirstChild(parts, "country");
        if (countryNode != null) {
            Country country = new Country();
            String key = getFirstAttribute(countryNode, null, "key");
            if (key != null) {
                country.setKey(key);
            }
            String content = countryNode.getTextContent();
            if (content != null) {
                country.setContent(content);
            }
            address.setCountry(country);
        }

        affiliation.setAddress(address);
    }

    /** Appends one name part (text content plus optional {@code type}) to the org name. */
    private void parseOrgName(Node orgNode, OrgName orgName) {
        OrgTypeName typeName = new OrgTypeName();

        String orgContent = orgNode.getTextContent();
        if (orgContent != null) {
            typeName.setName(orgContent);
        }

        String orgType = getFirstAttribute(orgNode, null, "type");
        if (orgType != null) {
            typeName.setType(orgType);
        }

        orgName.getTypeNames().add(typeName);
    }

    /**
     * Extracts "Abstract" (first "p" under "abstract") and "Keyword" entries (one per "term"
     * under "textClass/keywords", or the whole keywords text when there are no terms).
     */
    private void parseProfileDesc(Node profileDesc, Metadata metadata) {
        Node abstractNode = getFirstChild(profileDesc.getChildNodes(), "abstract");
        if (abstractNode != null) {
            Node pNode = getFirstChild(abstractNode.getChildNodes(), "p");
            if (pNode != null) {
                metadata.add("Abstract", pNode.getTextContent());
            }
        }

        Node textClassNode = getFirstChild(profileDesc.getChildNodes(), "textClass");
        if (textClassNode == null) {
            return;
        }
        Node keywordsNode = getFirstChild(textClassNode.getChildNodes(), "keywords");
        if (keywordsNode == null) {
            return;
        }

        List<Node> terms = getChildNodes(keywordsNode.getChildNodes(), "term");
        if (terms.isEmpty()) {
            // test AJ15.pdf
            metadata.add("Keyword", keywordsNode.getTextContent());
        } else {
            for (Node term : terms) {
                metadata.add("Keyword", term.getTextContent());
            }
        }
    }

    /** Returns {@code val} followed by a space, or a single space when it is null or empty. */
    private String printOrBlank(String val) {
        if (val != null && !val.equals("")) {
            return val + " ";
        }
        return " ";
    }

    /** An author: name parts plus the list of affiliations. */
    class Author {

        private String surName;
        private String middleName;
        private String firstName;
        private List<Affiliation> affiliations;

        public Author() {
            this.surName = null;
            this.middleName = null;
            this.firstName = null;
            this.affiliations = new ArrayList<>();
        }

        /**
         * @return the surName
         */
        public String getSurName() {
            return surName;
        }

        /**
         * @param surName the surName to set
         */
        public void setSurName(String surName) {
            this.surName = surName;
        }

        /**
         * @return the middleName
         */
        public String getMiddleName() {
            return middleName;
        }

        /**
         * @param middleName the middleName to set
         */
        public void setMiddleName(String middleName) {
            this.middleName = middleName;
        }

        /**
         * @return the firstName
         */
        public String getFirstName() {
            return firstName;
        }

        /**
         * @param firstName the firstName to set
         */
        public void setFirstName(String firstName) {
            this.firstName = firstName;
        }

        /**
         * @return the affiliations
         */
        public List<Affiliation> getAffiliations() {
            return affiliations;
        }

        /**
         * @param affiliations the affiliations to set
         */
        public void setAffiliations(List<Affiliation> affiliations) {
            this.affiliations = affiliations;
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {
            // The conditional must be parenthesised: '+' binds tighter than '!=' and '?:'.
            return "Author [surName=" + surName
                    + ", middleName=" + (middleName != null ? middleName : "")
                    + ", firstName=" + firstName
                    + ", affiliations=" + affiliations + "]";
        }
    }

    /** An affiliation: an organisation name and an address. */
    class Affiliation {

        private OrgName orgName;
        private Address address;

        public Affiliation() {
            this.orgName = new OrgName();
            this.address = new Address();
        }

        /**
         * @return the orgName
         */
        public OrgName getOrgName() {
            return orgName;
        }

        /**
         * @param orgName the orgName to set
         */
        public void setOrgName(OrgName orgName) {
            this.orgName = orgName;
        }

        /**
         * @return the address
         */
        public Address getAddress() {
            return address;
        }

        /**
         * @param address the address to set
         */
        public void setAddress(Address address) {
            this.address = address;
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#equals(java.lang.Object)
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof Affiliation)) {
                return false;
            }
            Affiliation other = (Affiliation) obj;
            return Objects.equals(address, other.address)
                    && Objects.equals(orgName, other.orgName);
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#hashCode()
         */
        @Override
        public int hashCode() {
            return Objects.hash(address, orgName);
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {
            return "Affiliation {orgName=" + orgName + ", address=" + address + "}";
        }
    }

    /** An organisation name made of one or more typed name parts. */
    class OrgName {

        private List<OrgTypeName> typeNames;

        public OrgName() {
            this.typeNames = new ArrayList<>();
        }

        /**
         * @return the typeNames
         */
        public List<OrgTypeName> getTypeNames() {
            return typeNames;
        }

        /**
         * @param typeNames the typeNames to set
         */
        public void setTypeNames(List<OrgTypeName> typeNames) {
            this.typeNames = typeNames;
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {
            // Each part is followed by a space, so a non-empty result ends with one.
            StringBuilder builder = new StringBuilder();
            for (OrgTypeName on : this.typeNames) {
                builder.append(on.getName());
                builder.append(" ");
            }
            return builder.toString();
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#equals(java.lang.Object)
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof OrgName)) {
                return false;
            }
            // Compare the parts themselves, not merely how many there are.
            return Objects.equals(typeNames, ((OrgName) obj).typeNames);
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#hashCode()
         */
        @Override
        public int hashCode() {
            return Objects.hashCode(typeNames);
        }
    }

    /** One part of an organisation name together with its optional type. */
    class OrgTypeName {

        private String name;
        private String type;

        public OrgTypeName() {
            this.name = null;
            this.type = null;
        }

        /**
         * @return the name
         */
        public String getName() {
            return name;
        }

        /**
         * @param name the name to set
         */
        public void setName(String name) {
            this.name = name;
        }

        /**
         * @return the type
         */
        public String getType() {
            return type;
        }

        /**
         * @param type the type to set
         */
        public void setType(String type) {
            this.type = type;
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#equals(java.lang.Object)
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof OrgTypeName)) {
                return false;
            }
            // Null-safe: type stays null when the orgName element has no type attribute.
            OrgTypeName other = (OrgTypeName) obj;
            return Objects.equals(type, other.type) && Objects.equals(name, other.name);
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#hashCode()
         */
        @Override
        public int hashCode() {
            return Objects.hash(type, name);
        }
    }

    /** A postal address; every component may be null when absent from the document. */
    private static class Address {

        private String region;
        private String postCode;
        private String settlment;
        private Country country;

        public Address() {
            this.region = null;
            this.postCode = null;
            this.settlment = null;
            this.country = new Country();
        }

        /**
         * @return the region
         */
        public String getRegion() {
            return region;
        }

        /**
         * @param region the region to set
         */
        public void setRegion(String region) {
            this.region = region;
        }

        /**
         * @return the postCode
         */
        public String getPostCode() {
            return postCode;
        }

        /**
         * @param postCode the postCode to set
         */
        public void setPostCode(String postCode) {
            this.postCode = postCode;
        }

        /**
         * @return the settlment
         */
        public String getSettlment() {
            return settlment;
        }

        /**
         * @param settlment the settlment to set
         */
        public void setSettlment(String settlment) {
            this.settlment = settlment;
        }

        /**
         * @return the country
         */
        public Country getCountry() {
            return country;
        }

        /**
         * @param country the country to set
         */
        public void setCountry(Country country) {
            this.country = country;
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#equals(java.lang.Object)
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof Address)) {
                return false;
            }
            // All four components take part in the comparison, null or not.
            Address other = (Address) obj;
            return Objects.equals(settlment, other.settlment)
                    && Objects.equals(country, other.country)
                    && Objects.equals(postCode, other.postCode)
                    && Objects.equals(region, other.region);
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#hashCode()
         */
        @Override
        public int hashCode() {
            return Objects.hash(settlment, country, postCode, region);
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {
            StringBuilder builder = new StringBuilder();
            builder.append(settlment);
            builder.append(", ");
            builder.append(region);
            builder.append(" ");
            builder.append(postCode);
            builder.append(" ");
            // Guard against setCountry(null); a missing value prints as "null" like the others.
            builder.append(country != null ? country.getContent() : null);
            return builder.toString();
        }
    }

    /** A country: the {@code key} attribute and the element's text content. */
    private static class Country {

        private String key;
        private String content;

        public Country() {
            this.key = null;
            this.content = null;
        }

        /**
         * @return the key
         */
        public String getKey() {
            return key;
        }

        /**
         * @param key the key to set
         */
        public void setKey(String key) {
            this.key = key;
        }

        /**
         * @return the content
         */
        public String getContent() {
            return content;
        }

        /**
         * @param content the content to set
         */
        public void setContent(String content) {
            this.content = content;
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#equals(java.lang.Object)
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof Country)) {
                return false;
            }
            Country other = (Country) obj;
            return Objects.equals(key, other.key) && Objects.equals(content, other.content);
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Object#hashCode()
         */
        @Override
        public int hashCode() {
            return Objects.hash(key, content);
        }
    }

    /**
     * Returns the first node in {@code childNodes} whose node name equals {@code name},
     * or null when there is none.
     */
    private static Node getFirstChild(NodeList childNodes, String name) {
        for (int i = 0; i < childNodes.getLength(); i++) {
            Node n = childNodes.item(i);
            if (name.equals(n.getNodeName())) {
                return n;
            }
        }
        return null;
    }

    /**
     * Returns the value of the first attribute of {@code node} named {@code name}, or null
     * when there is none. The attribute's local name is used for matching; when the DOM
     * reports no local name, the node name is used instead. {@code ns} is accepted for
     * call-site readability but is not consulted.
     */
    private static String getFirstAttribute(Node node, String ns, String name) {
        if (!node.hasAttributes()) {
            return null;
        }
        NamedNodeMap attrs = node.getAttributes();
        for (int i = 0; i < attrs.getLength(); i++) {
            Node attr = attrs.item(i);
            // getLocalName() is null for nodes created without namespace support.
            String attrName = attr.getLocalName() != null ? attr.getLocalName() : attr.getNodeName();
            if (name.equals(attrName)) {
                return attr.getNodeValue();
            }
        }
        return null;
    }

    /**
     * Returns every node in {@code childNodes} whose local name equals {@code localName},
     * in document order. Nodes without a local name are skipped.
     */
    private static List<Node> getChildNodes(NodeList childNodes, String localName) {
        List<Node> ret = new ArrayList<>();
        for (int i = 0; i < childNodes.getLength(); i++) {
            Node child = childNodes.item(i);
            if (child.getLocalName() != null && child.getLocalName().equals(localName)) {
                ret.add(child);
            }
        }
        return ret;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy