org.zanata.adapter.po.PoReader2 Maven / Gradle / Ivy
/*
* Copyright 2013, Red Hat, Inc. and individual contributors
* as indicated by the @author tags. See the copyright.txt file in the
* distribution for a full listing of individual contributors.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package org.zanata.adapter.po;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import org.apache.commons.lang3.StringUtils;
import org.fedorahosted.tennera.jgettext.HeaderFields;
import org.fedorahosted.tennera.jgettext.Message;
import org.fedorahosted.tennera.jgettext.catalog.parse.MessageStreamParser;
import org.xml.sax.InputSource;
import org.zanata.common.ContentState;
import org.zanata.common.ContentType;
import org.zanata.common.LocaleId;
import org.zanata.common.util.ContentStateUtil;
import org.zanata.rest.dto.extensions.comment.SimpleComment;
import org.zanata.rest.dto.extensions.gettext.HeaderEntry;
import org.zanata.rest.dto.extensions.gettext.PoHeader;
import org.zanata.rest.dto.extensions.gettext.PoTargetHeader;
import org.zanata.rest.dto.extensions.gettext.PotEntryHeader;
import org.zanata.rest.dto.resource.Resource;
import org.zanata.rest.dto.resource.TextFlow;
import org.zanata.rest.dto.resource.TextFlowTarget;
import org.zanata.rest.dto.resource.TranslationsResource;
import org.zanata.util.HashUtil;
import org.zanata.util.ShortString;
import com.google.common.collect.ImmutableSet;
/**
 * Reads gettext PO and POT content and converts it to Zanata REST DTOs:
 * {@link #extractTemplate} turns a POT into a {@link Resource} and
 * {@link #extractTarget} turns a PO into a {@link TranslationsResource}.
 *
 * The only instance state is the flag that selects how text flow ids are
 * derived (see {@link #createId(Message, boolean)}).
 */
public class PoReader2 {
    public static final ContentType PO_CONTENT_TYPE = new ContentType(
            "application/x-gettext");

    /** Header keys that are copied into the POT (source document) header. */
    public static final ImmutableSet<String> POT_HEADER_FIELDS = ImmutableSet
            .of(HeaderFields.KEY_ProjectIdVersion,
                    HeaderFields.KEY_ReportMsgidBugsTo,
                    HeaderFields.KEY_PotCreationDate,
                    HeaderFields.KEY_MimeVersion, HeaderFields.KEY_ContentType,
                    HeaderFields.KEY_ContentTransferEncoding);

    /** Header keys that are always copied into the PO (target) header. */
    public static final ImmutableSet<String> PO_HEADER_FIELDS = ImmutableSet
            .of(HeaderFields.KEY_PoRevisionDate,
                    HeaderFields.KEY_LastTranslator,
                    HeaderFields.KEY_LanguageTeam, HeaderFields.KEY_Language,
                    "Plural-Forms", "X-Generator");

    /** Charset used for byte input when the InputSource names no encoding. */
    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    private final boolean mapIdFromMsgctxt;

    /**
     * @param mapIdFromMsgctxt
     *            true to use each message's msgctxt as its id, false to
     *            generate the id by hashing msgctxt and msgid
     */
    public PoReader2(boolean mapIdFromMsgctxt) {
        this.mapIdFromMsgctxt = mapIdFromMsgctxt;
    }

    /** Creates a reader that generates ids by hashing msgctxt and msgid. */
    public PoReader2() {
        this(false);
    }

    /**
     * Extract contents of a PO file and convert to a TranslationsResource. NB:
     * If the file contains the gettext header Content-Type, it must be set to
     * ASCII, CHARSET, UTF8 or UTF-8, or an exception will occur.
     *
     * @param inputSource
     *            PO file to be extracted
     * @return converted PO file as TranslationsResource
     * @throws IllegalStateException
     *             if more than one header message is found
     */
    public TranslationsResource extractTarget(InputSource inputSource) {
        TranslationsResource document = new TranslationsResource();
        MessageStreamParser messageParser = createParser(inputSource);
        boolean headerFound = false;
        while (messageParser.hasNext()) {
            Message message = messageParser.next();
            if (message.isHeader()) {
                if (headerFound) {
                    throw new IllegalStateException("found a second header!");
                }
                headerFound = true;
                // add target header data
                PoTargetHeader poHeader = new PoTargetHeader();
                extractPoHeader(message, poHeader);
                document.getExtensions(true).add(poHeader);
            } else if (message.isObsolete()) {
                // TODO append obsolete
            } else {
                // createId may clear msgctxt, so it must run before anything
                // else reads the message
                String id = createId(message);
                // add the target content (msgstr)
                TextFlowTarget tfTarget = new TextFlowTarget();
                tfTarget.setResId(id);
                List<String> sourceContents = getSourceContents(message);
                tfTarget.setSourceHash(HashUtil.sourceHash(sourceContents));
                tfTarget.setDescription(ShortString.shorten(message.getMsgid()));
                tfTarget.setContents(getContents(message));
                tfTarget.setState(getContentState(message));
                // add the PO comment
                tfTarget.getExtensions(true).add(
                        new SimpleComment(StringUtils.join(
                                message.getComments(), "\n")));
                document.getTextFlowTargets().add(tfTarget);
            }
        }
        return document;
    }

    /**
     * Checks that the file is safe to read as UTF-8.
     *
     * A missing Content-Type header, or one without a charset parameter, is
     * accepted. Otherwise the charset must be CHARSET, ASCII, UTF-8 or UTF8
     * (compared case-insensitively).
     *
     * @param hf
     *            header fields of the PO/POT header message
     * @throws RuntimeException
     *             if a charset is declared and is not one of the accepted
     *             values
     */
    private static void checkContentType(HeaderFields hf) {
        String contentType = hf.getValue(HeaderFields.KEY_ContentType);
        if (contentType == null) {
            return;
        }
        // Locale.ROOT: with a Turkish default locale "ASCII" would lower-case
        // to "ascıı" (dotless i) and a valid header would be rejected.
        String ct = contentType.toLowerCase(Locale.ROOT);
        if (!ct.contains("charset=")) {
            return;
        }
        boolean supported =
                ct.contains("charset=charset") || ct.contains("charset=ascii")
                        || ct.contains("charset=utf-8")
                        || ct.contains("charset=utf8");
        if (!supported) {
            throw new RuntimeException("unsupported charset in "
                    + HeaderFields.KEY_ContentType + ": " + contentType);
        }
    }

    /**
     * Copies the header comment and the {@link #POT_HEADER_FIELDS} entries of
     * a header message into the given POT header, after validating the
     * declared charset.
     */
    private static void extractPotHeader(Message message, PoHeader potHeader) {
        potHeader.setComment(StringUtils.join(message.getComments(), "\n"));
        HeaderFields hf = HeaderFields.wrap(message);
        checkContentType(hf);
        for (String key : hf.getKeys()) {
            // we add any custom fields to the PO only, not the POT
            // TODO this should be configurable
            if (POT_HEADER_FIELDS.contains(key)) {
                potHeader.getEntries().add(
                        new HeaderEntry(key, hf.getValue(key)));
            }
        }
    }

    /**
     * Copies the header comment and header entries of a header message into
     * the given PO target header, after validating the declared charset.
     * Known PO fields and any custom fields are copied; fields that belong to
     * the POT only are skipped.
     */
    private static void
            extractPoHeader(Message message, PoTargetHeader poHeader) {
        poHeader.setComment(StringUtils.join(message.getComments(), "\n"));
        HeaderFields hf = HeaderFields.wrap(message);
        checkContentType(hf);
        for (String key : hf.getKeys()) {
            // we add any custom fields to the PO only, not the POT
            // TODO this should be configurable
            boolean poField = PO_HEADER_FIELDS.contains(key);
            boolean customField = !POT_HEADER_FIELDS.contains(key);
            if (poField || customField) {
                poHeader.getEntries().add(
                        new HeaderEntry(key, hf.getValue(key)));
            }
        }
    }

    /**
     * Extract contents of a POT file and convert to a Resource. NB: If the file
     * contains the gettext header Content-Type, it must be set to ASCII,
     * CHARSET, UTF8 or UTF-8, or an exception will occur.
     *
     * @param inputSource
     *            POT file to be extracted
     * @param sourceLocaleId
     *            locale of POT, used to set metadata fields
     * @param docName
     *            name of POT file (minus .pot extension) used to set metadata
     *            fields
     * @return converted POT file as Resource
     * @throws IllegalStateException
     *             if more than one header message is found
     */
    public Resource extractTemplate(InputSource inputSource,
            LocaleId sourceLocaleId, String docName) {
        Resource document = new Resource(docName);
        MessageStreamParser messageParser = createParser(inputSource);
        document.setLang(sourceLocaleId);
        document.setContentType(PO_CONTENT_TYPE);
        List<TextFlow> resources = document.getTextFlows();
        boolean headerFound = false;
        while (messageParser.hasNext()) {
            Message message = messageParser.next();
            if (message.isHeader()) {
                if (headerFound) {
                    throw new IllegalStateException("found a second header!");
                }
                headerFound = true;
                // store POT data
                PoHeader potHeader = new PoHeader();
                extractPotHeader(message, potHeader);
                document.getExtensions(true).add(potHeader);
            } else if (message.isObsolete()) {
                // TODO append obsolete
            } else {
                // createId may clear msgctxt; it must run before the entry
                // header is built so a mapped id is not stored as context
                String id = createId(message);
                // add the content (msgid)
                TextFlow tf = new TextFlow(id, sourceLocaleId);
                tf.setPlural(message.isPlural());
                if (message.isPlural()) {
                    tf.setContents(message.getMsgid(), message.getMsgidPlural());
                } else {
                    tf.setContents(message.getMsgid());
                }
                resources.add(tf);
                // add the entry header POT fields
                tf.getExtensions(true).add(createPotEntryHeader(message));
                tf.getExtensions().add(createSimpleComment(message));
            }
        }
        return document;
    }

    /**
     * Builds the per-entry POT header (context, format flags and source
     * references) for a message. The context is only set when msgctxt is
     * non-null.
     */
    private static PotEntryHeader createPotEntryHeader(Message message) {
        PotEntryHeader data = new PotEntryHeader();
        if (message.getMsgctxt() != null) {
            data.setContext(message.getMsgctxt());
        }
        data.getFlags().addAll(message.getFormats());
        data.getReferences().addAll(message.getSourceReferences());
        return data;
    }

    /**
     * Joins the extracted comments of a message with newlines into a
     * SimpleComment.
     */
    private static SimpleComment createSimpleComment(Message message) {
        String comment = StringUtils.join(message.getExtractedComments(), "\n");
        return new SimpleComment(comment);
    }

    /**
     * Creates a message parser for the given input source. The character
     * stream is preferred, then the byte stream, then the system id (which
     * is opened as a URL). Byte input is decoded with the input source's
     * encoding, or UTF-8 if none is set.
     *
     * NOTE(review): a stream opened from the system id is handed to the
     * parser and is not closed here; whether the parser closes it is not
     * visible from this class.
     *
     * @throws RuntimeException
     *             if the URL cannot be opened, or if the input source has no
     *             character stream, byte stream or system id
     */
    static MessageStreamParser createParser(InputSource inputSource) {
        if (inputSource.getCharacterStream() != null) {
            return new MessageStreamParser(inputSource.getCharacterStream());
        }
        if (inputSource.getByteStream() != null) {
            return new MessageStreamParser(inputSource.getByteStream(),
                    charsetOf(inputSource));
        }
        if (inputSource.getSystemId() != null) {
            try {
                URL url = new URL(inputSource.getSystemId());
                // resolve the charset before opening the stream so that an
                // unsupported encoding name cannot leak an open stream
                Charset charset = charsetOf(inputSource);
                return new MessageStreamParser(url.openStream(), charset);
            } catch (IOException e) {
                // TODO throw stronger typed exception
                throw new RuntimeException(
                        "failed to get input from url in inputSource", e);
            }
        }
        // TODO throw stronger typed exception
        throw new RuntimeException("not a valid inputSource");
    }

    /** Returns the input source's declared encoding, or UTF-8 if none. */
    private static Charset charsetOf(InputSource inputSource) {
        String encoding = inputSource.getEncoding();
        return encoding != null ? Charset.forName(encoding) : DEFAULT_CHARSET;
    }

    /**
     * Returns the source strings of a message: msgid and msgid_plural for a
     * plural message, otherwise just msgid.
     */
    private static List<String> getSourceContents(Message message) {
        if (message.isPlural()) {
            return Arrays.asList(message.getMsgid(), message.getMsgidPlural());
        }
        return Arrays.asList(message.getMsgid());
    }

    /**
     * Returns the contents of the Message (msgstr for singular, msgstr_plural
     * for plural) Also ensures at least one entry.
     *
     * @param message
     *            message to read translations from
     * @return the translated strings; a single empty string if a plural
     *         message has no plural forms
     */
    private static List<String> getContents(Message message) {
        if (!message.isPlural()) {
            return Arrays.asList(message.getMsgstr());
        }
        List<String> plurals = message.getMsgstrPlural();
        if (plurals.isEmpty()) {
            return Arrays.asList("");
        }
        return plurals;
    }

    // NB: we don't check that the number of msgstr_plurals matches nplurals on
    // the client, only on the server
    /**
     * Determines the content state of a message: fuzzy messages request
     * NeedReview, others request Translated, and the final state is decided
     * by {@link ContentStateUtil#determineState} from the message contents.
     */
    static ContentState getContentState(Message message) {
        ContentState requestedState =
                message.isFuzzy() ? ContentState.NeedReview
                        : ContentState.Translated;
        List<String> contents = getContents(message);
        return ContentStateUtil.determineState(requestedState, contents);
    }

    /**
     * Generate or extract id from message, using this reader's setting for
     * whether to extract the id from msgctxt.
     *
     * @see #createId(Message, boolean)
     */
    private String createId(Message message) {
        return createId(message, mapIdFromMsgctxt);
    }

    /**
     * Generate or extract id from a message.
     *
     * If id is extracted from msgctxt, the value of msgctxt is cleared and this
     * method will throw an exception on subsequent calls with the same message.
     *
     * @param message
     *            message to derive the id from; its msgctxt is set to null
     *            when mapIdFromMsgctxt is true
     * @param mapIdFromMsgctxt
     *            true to extract id from msgctxt and set msgctxt to null,
     *            otherwise id is generated by hashing msgctxt and msgid.
     * @return extracted or generated id.
     *
     * @throws RuntimeException
     *             if called with mapIdFromMsgctxt=true but msgctxt is null or
     *             empty.
     */
    static String createId(Message message, boolean mapIdFromMsgctxt) {
        if (mapIdFromMsgctxt) {
            String zanataId = message.getMsgctxt();
            // null or empty id is not valid
            if (zanataId == null || zanataId.isEmpty()) {
                // TODO throw stronger typed exception
                // report the msgid: it is the source text, and msgstr is
                // typically empty when reading a template
                throw new RuntimeException(
                        "Tried to map id from msgctxt but msgctxt was "
                                + zanataId + " for text flow with source: "
                                + message.getMsgid());
            }
            // we do not want this msgctxt stored on the server
            message.setMsgctxt(null);
            return zanataId;
        }
        // NUL separator keeps (msgctxt, msgid) pairs from colliding when
        // concatenated
        String sep = "\u0000";
        String hashBase =
                message.getMsgctxt() == null ? message.getMsgid() : message
                        .getMsgctxt() + sep + message.getMsgid();
        return HashUtil.generateHash(hashBase);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy