is2.util.ExtractParagraphs Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of anna Show documentation
Tools for Natural Language Analysis, Generation and Machine Learning
The newest version!
package is2.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.StringTokenizer;
/**
 * Command-line tool that collects the text found between paragraph markers
 * in every file of a directory and writes it to a single UTF-8 output file.
 */
public class ExtractParagraphs {

    /** Size, in characters, of the buffers used for reading and writing. */
    private static final int BUFFER_SIZE = 32768;

    /**
     * Reads every regular file in the directory {@code args[0]} and writes the
     * paragraph text it contains to the file {@code args[1]}. Finally prints
     * the number of paragraph lines that were written.
     *
     * @param args {@code args[0]} is the input directory, {@code args[1]} the output file
     * @throws IOException if the input is not a readable directory, or if reading
     *                     an input file or writing the output file fails
     */
    public static void main(String args[]) throws IOException {
        // Both the input directory and the output file are required.
        if (args.length < 2) {
            System.out.println("Please provide a file name.");
            System.out.println("Usage: ExtractParagraphs <input-directory> <output-file>");
            System.exit(0);
        }

        File inputDir = new File(args[0]);
        // File.list() returns null when the path is not a directory or cannot be read.
        String[] fileNames = inputDir.list();
        if (fileNames == null) {
            throw new IOException("Not a readable directory: " + args[0]);
        }

        int cnt = 0;
        BufferedWriter write = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(args[1]), "UTF-8"), BUFFER_SIZE);
        try {
            for (String fileName : fileNames) {
                // Resolve against the directory so a missing trailing separator in args[0] is harmless.
                File input = new File(inputDir, fileName);
                if (!input.isFile()) {
                    continue; // sub-directories and other non-regular entries cannot be read as text
                }
                cnt += extractFile(input, write);
            }
            write.flush();
        } finally {
            write.close();
        }
        System.out.println("Extract "+cnt+" lines ");
    }

    /**
     * Copies the paragraph text of one input file to the writer.
     *
     * A line starting with a paragraph-start marker opens a paragraph, a line
     * starting with a paragraph-end marker closes it and emits a line break.
     * Marker lines themselves are never written. Lines inside a paragraph are
     * written with every ". " removed and without any separator between lines.
     *
     * @param input the file to read, decoded as UTF-8
     * @param write the destination for the extracted text
     * @return the number of lines written from inside paragraphs
     * @throws IOException if reading or writing fails
     */
    private static int extractFile(File input, BufferedWriter write) throws IOException {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(input), "UTF-8"), BUFFER_SIZE);
        try {
            int lines = 0;
            boolean inParagraph = false;
            String s;
            while ((s = reader.readLine()) != null) {
                // NOTE(review): the marker literals were lost in the available copy of this
                // source; <P>/<p> and </P>/</p> are reconstructed from the
                // "paragraph start" / "paragraph end" comments - confirm against the corpus format.
                if (s.startsWith("<P>") || s.startsWith("<p>")) {
                    inParagraph = true; // paragraph start
                    continue;
                }
                if (s.startsWith("</P>") || s.startsWith("</p>")) {
                    inParagraph = false; // paragraph end
                    write.newLine();
                    continue;
                }
                if (inParagraph) {
                    // The pieces are written back to back, so the ". " delimiters are dropped.
                    for (String p : s.split("\\. ")) {
                        write.write(p);
                    }
                    lines++;
                }
            }
            return lines;
        } finally {
            reader.close();
        }
    }
}