All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.languagetool.dev.wikipedia.IpaExtractor Maven / Gradle / Ivy

There is a newer version: 6.3
Show newest version
/* LanguageTool, a natural language style checker
 * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev.wikipedia;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extract IPA information from Wikipedia XML dump. Might be used to
 * find words with difficult spelling. Note that only up to one IPA
 * information is found per article.
 *
 * <p>Example Wikitext that is detected:
 * <pre>'''Trance''' [{{IPA|trɑ̃s}}]</pre>
 *
 * @since 2.9
 */
class IpaExtractor {

  /** Matches a bold/italic headword followed by an IPA template: group 1 = headword, group 2 = IPA. */
  private static final Pattern FULL_IPA_PATTERN = Pattern.compile("'''?(.*?)'''?\\s+\\[?\\{\\{IPA\\|([^}]*)\\}\\}");
  /** Fallback that matches a bare IPA template anywhere in the text: group 1 = IPA. */
  private static final Pattern IPA_PATTERN = Pattern.compile("\\{\\{IPA\\|([^}]*)\\}\\}");

  /** Number of {@code <title>} elements seen over all processed files. */
  private int articleCount = 0;
  /** Number of {@code <text>} elements for which an IPA template was extracted. */
  private int ipaCount = 0;

  /**
   * Processes every file given on the command line and prints the extracted
   * IPA information to stdout, followed by summary counts on stderr.
   *
   * @param args paths of Wikipedia XML dump files; at least one is required
   * @throws XMLStreamException if a dump is not well-formed XML
   * @throws IOException if a file cannot be opened (see {@link FileNotFoundException}) or closed
   */
  public static void main(String[] args) throws XMLStreamException, IOException {
    if (args.length == 0) {
      System.out.println("Usage: " + IpaExtractor.class.getSimpleName() + " <xml-dump...>");
      System.exit(1);
    }
    IpaExtractor extractor = new IpaExtractor();
    for (String filename : args) {
      // try-with-resources so that the file handle is released even if parsing fails
      try (FileInputStream fis = new FileInputStream(filename)) {
        extractor.run(fis);
      }
    }
    System.err.println("articleCount: " + extractor.articleCount);
    System.err.println("IPA count: " + extractor.ipaCount);
  }

  /**
   * Streams through one XML dump, remembering the most recent {@code <title>}
   * and handing each {@code <text>} element to {@link #handleTextElement}.
   *
   * @param fis the opened dump; the caller remains responsible for closing it
   * @throws XMLStreamException if the XML cannot be parsed
   */
  private void run(FileInputStream fis) throws XMLStreamException {
    XMLInputFactory factory = XMLInputFactory.newInstance();
    // The extractor needs neither DTDs nor external entities, so switch them
    // off rather than let a crafted dump trigger external entity resolution.
    factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
    factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
    XMLEventReader reader = factory.createXMLEventReader(fis);
    try {
      String title = null;
      while (reader.hasNext()) {
        XMLEvent event = reader.nextEvent();
        if (event.getEventType() != XMLStreamConstants.START_ELEMENT) {
          continue;
        }
        String elementName = event.asStartElement().getName().getLocalPart();
        switch (elementName) {
          case "title":
            // getElementText() copes with empty titles and with text that the
            // parser delivers in several chunks; it leaves the reader on </title>.
            title = reader.getElementText();
            articleCount++;
            break;
          case "text":
            ipaCount += handleTextElement(title, reader);
            break;
          default:
            break;
        }
      }
    } finally {
      // releases parser resources; does not close the underlying stream
      reader.close();
    }
  }

  /**
   * Collects the character content of the current {@code <text>} element and
   * prints the first IPA template found in it, if any.
   *
   * @param title title of the article the text belongs to, used as output prefix
   * @param reader reader positioned directly after the {@code <text>} start tag
   * @return 1 if an IPA template was extracted, 0 otherwise
   * @throws XMLStreamException if the XML cannot be parsed
   */
  private int handleTextElement(String title, XMLEventReader reader) throws XMLStreamException {
    // The text may arrive in several consecutive character events.
    StringBuilder sb = new StringBuilder();
    XMLEvent event = reader.nextEvent();
    while (event.isCharacters()) {
      sb.append(event.asCharacters().getData());
      event = reader.nextEvent();
    }
    String wikiText = sb.toString();
    // cheap pre-check so the regexes only run on articles that mention IPA at all
    if (!wikiText.contains("{{IPA")) {
      return 0;
    }
    Matcher fullMatcher = FULL_IPA_PATTERN.matcher(wikiText);
    if (fullMatcher.find()) {
      System.out.println(title + ": " + fullMatcher.group(1) + " -> " + fullMatcher.group(2));
      return 1;
    }
    Matcher ipaMatcher = IPA_PATTERN.matcher(wikiText);
    if (ipaMatcher.find()) {
      System.out.println(title + ": " + ipaMatcher.group(1));
      return 1;
    }
    // "{{IPA" occurred, but not in a form either pattern understands
    System.out.println(title + ": (no pattern found)");
    return 0;
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy