
// com.worksap.nlp.sudachi.dictionary.CSVParser Maven / Gradle / Ivy
/*
* Copyright (c) 2019 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.worksap.nlp.sudachi.dictionary;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A small pull parser for comma-separated text.
 *
 * <p>
 * Input is read one line at a time and split into tokens (comma, double
 * quote, text, end of line). {@link #getNextRecord()} assembles those tokens
 * into one record per call. A field wrapped in double quotes may contain
 * commas, line breaks and doubled quotes ({@code ""} yields one {@code "}).
 * Because lines are obtained with {@link BufferedReader#readLine()}, a line
 * break inside a quoted field is always stored as a single {@code '\n'}.
 *
 * <p>
 * Instances hold mutable parsing state and perform no synchronization.
 */
class CSVParser implements Closeable {

    /** A lexical unit produced by the tokenizer. */
    static class Token {
        /** The kinds of token the tokenizer emits. */
        enum Type {
            COMMA, DQUOTE, NL, TEXTDATA, EOF
        }

        final Type type;
        final String content;

        Token(Type type, String content) {
            this.type = type;
            this.content = content;
        }
    }

    static final Token COMMA_TOKEN = new Token(Token.Type.COMMA, ",");
    static final Token DQUOTE_TOKEN = new Token(Token.Type.DQUOTE, "\"");
    static final Token NL_TOKEN = new Token(Token.Type.NL, "\n");
    static final Token EOF_TOKEN = new Token(Token.Type.EOF, null);

    /** Matches a single comma, a single double quote, or a run of any other characters. */
    static final Pattern TOKEN_PATTERN = Pattern.compile(",|\"|[^,\"]+");

    static final String INVALID_FORMAT_ERROR_MESSAGE = "invalid format";
    static final String UNKNOWN_ERROR_MESSAGE = "unknown error";

    private final BufferedReader reader;

    /** Pending tokens of the current line; the head is the next token to be returned. */
    private final Deque<Token> tokenBuffer = new ArrayDeque<>();

    /** True when the previous field ended with a comma, so another field must follow. */
    private boolean hasNextField = false;

    /**
     * True when the last {@code null} returned by {@link #getField()} means
     * "end of record" rather than "end of input". Kept out of band so that a
     * quoted field consisting of a single line break is not mistaken for the
     * record terminator.
     */
    private boolean endOfRecord = false;

    /**
     * Creates a parser reading from the given source.
     *
     * @param reader
     *            the character source; it is wrapped in a {@link BufferedReader}
     */
    CSVParser(Reader reader) {
        this.reader = new BufferedReader(reader);
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException
     *             if closing the reader fails
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Reads the next record.
     *
     * @return the fields of the record in order (an empty list for an empty
     *         line), or {@code null} when the input is exhausted
     * @throws IOException
     *             if reading from the underlying reader fails
     * @throws IllegalArgumentException
     *             if the input is malformed, e.g. an unterminated quoted
     *             field or a stray double quote
     */
    List<String> getNextRecord() throws IOException {
        List<String> record = new ArrayList<>();
        hasNextField = false;
        endOfRecord = false;
        String field;
        while ((field = getField()) != null) {
            record.add(field);
        }
        // null means either "record finished" or "no more input"; the flag tells which.
        return endOfRecord ? record : null;
    }

    /**
     * Reads one field, or returns {@code null} at the end of the record (with
     * {@link #endOfRecord} set) or at the end of the input.
     */
    private String getField() throws IOException {
        if (hasNextField) {
            return getNextField();
        } else {
            return getFirstField();
        }
    }

    /**
     * Reads a field that was not announced by a preceding comma. This is
     * either the first field of a record or the line end that terminates it.
     */
    private String getFirstField() throws IOException {
        Token token = getToken();
        switch (token.type) {
        case COMMA:
            hasNextField = true;
            return "";
        case DQUOTE:
            return getEscapedField();
        case NL:
            endOfRecord = true;
            return null;
        case TEXTDATA:
            return getUnescapedField(token);
        case EOF:
            return null;
        default:
            // unreachable
            throw new IllegalArgumentException(UNKNOWN_ERROR_MESSAGE);
        }
    }

    /** Reads the field that follows a comma; it may be empty. */
    private String getNextField() throws IOException {
        Token token = getToken();
        switch (token.type) {
        case COMMA:
            hasNextField = true;
            return "";
        case DQUOTE:
            return getEscapedField();
        case NL:
            // Trailing comma: emit the empty last field and leave the line
            // end for the next call so it terminates the record.
            hasNextField = false;
            ungetToken(NL_TOKEN);
            return "";
        case TEXTDATA:
            return getUnescapedField(token);
        case EOF:
            hasNextField = false;
            return "";
        default:
            // unreachable
            throw new IllegalArgumentException(UNKNOWN_ERROR_MESSAGE);
        }
    }

    /**
     * Reads the remainder of a quoted field; the opening quote has already
     * been consumed.
     *
     * @throws IllegalArgumentException
     *             if text follows the closing quote or the input ends inside
     *             the quotes
     */
    private String getEscapedField() throws IOException {
        hasNextField = false;
        // Set after a quote that may be the closing one; a second quote
        // immediately afterwards turns the pair into a literal '"'.
        boolean isClosed = false;
        StringBuilder content = new StringBuilder();
        while (true) {
            Token token = getToken();
            switch (token.type) {
            case NL:
                if (isClosed) {
                    ungetToken(token);
                    return content.toString();
                } else {
                    content.append('\n');
                }
                break;
            case COMMA:
                if (isClosed) {
                    hasNextField = true;
                    return content.toString();
                } else {
                    content.append(',');
                }
                break;
            case DQUOTE:
                if (isClosed) {
                    content.append('"');
                    isClosed = false;
                } else {
                    isClosed = true;
                }
                break;
            case TEXTDATA:
                if (isClosed) {
                    throw new IllegalArgumentException(INVALID_FORMAT_ERROR_MESSAGE);
                } else {
                    content.append(token.content);
                }
                break;
            case EOF:
                if (!isClosed) {
                    throw new IllegalArgumentException(INVALID_FORMAT_ERROR_MESSAGE);
                }
                // Defensive: the tokenizer ends every line with an NL token,
                // so a closed field meets NL before it can meet EOF.
                return null;
            default:
                // unreachable
                throw new IllegalArgumentException(UNKNOWN_ERROR_MESSAGE);
            }
        }
    }

    /**
     * Reads the remainder of an unquoted field whose first text token has
     * already been consumed.
     *
     * @param firstToken
     *            the text token that starts the field
     * @throws IllegalArgumentException
     *             if a double quote appears inside the field
     */
    private String getUnescapedField(Token firstToken) throws IOException {
        hasNextField = false;
        StringBuilder content = new StringBuilder(firstToken.content);
        while (true) {
            Token token = getToken();
            switch (token.type) {
            case COMMA:
                hasNextField = true;
                return content.toString();
            case NL:
                ungetToken(token);
                return content.toString();
            case TEXTDATA:
                content.append(token.content);
                break;
            case DQUOTE:
                // A quote is only legal as the first character of a field.
                throw new IllegalArgumentException(INVALID_FORMAT_ERROR_MESSAGE);
            case EOF:
                // Defensive: the tokenizer ends every line with an NL token,
                // so NL is seen before EOF here.
                return null;
            default:
                // unreachable
                throw new IllegalArgumentException(UNKNOWN_ERROR_MESSAGE);
            }
        }
    }

    /**
     * Returns the next token, reading and tokenizing another line when the
     * buffer is empty. Every line read contributes a trailing
     * {@link #NL_TOKEN}; {@link #EOF_TOKEN} is returned once the reader has no
     * more lines.
     */
    private Token getToken() throws IOException {
        if (!tokenBuffer.isEmpty()) {
            return tokenBuffer.removeFirst();
        }
        String line = reader.readLine();
        if (line == null) {
            return EOF_TOKEN;
        }
        Matcher matcher = TOKEN_PATTERN.matcher(line);
        while (matcher.find()) {
            Token token;
            String content = matcher.group();
            if (content.equals(",")) {
                token = COMMA_TOKEN;
            } else if (content.equals("\"")) {
                token = DQUOTE_TOKEN;
            } else {
                token = new Token(Token.Type.TEXTDATA, content);
            }
            tokenBuffer.addLast(token);
        }
        tokenBuffer.addLast(NL_TOKEN);
        return tokenBuffer.removeFirst();
    }

    /** Pushes a token back so that it is the very next one returned by {@link #getToken()}. */
    private void ungetToken(Token token) {
        tokenBuffer.addFirst(token);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy