// relations.ChunkAnalyzer Maven / Gradle / Ivy
//
// Go to download
package relations;

import parser.Parser;

import java.util.*;

/**
 *
 * @author Chinh
 * @Date: Jun 16, 2011
 */
/**
 * 
 * Define all operations on Chunk such as forming NP, VP, PP, AP
 */
public class ChunkAnalyzer {

    /**
     * Fills the lookup collections used by the analysis methods by copying each
     * word list ({@code ccList}, {@code breakList}, {@code prepList},
     * {@code skipList}, {@code appoList}, {@code to_be}) into its collection.
     * {@code allMap} receives the contents of {@code ccMap} followed by those
     * of {@code breakMap}.
     */
    public ChunkAnalyzer() {
        // The two tables that are also combined into allMap.
        ccMap.addAll(Arrays.asList(ccList));
        breakMap.addAll(Arrays.asList(breakList));
        allMap.addAll(ccMap);
        allMap.addAll(breakMap);

        // The remaining, independent tables.
        prepMap.addAll(Arrays.asList(prepList));
        skipMap.addAll(Arrays.asList(skipList));
        appoMap.addAll(Arrays.asList(appoList));
        beMap.addAll(Arrays.asList(to_be));
    }
    /**
     * input: list of chunks from single sentence
     * output: Basic chunks (PP, NP) and Verb chunks (subject/object)
     * Drop chunks: chunks that can not extract event (no Pro and no Trg)
     */
    /** Basic chunk groups produced by the most recent analysis (filled by analyzeChunk and skipPhrase). */
    List<BasicChunk> bsList = new ArrayList<BasicChunk>();
    /** Verb chunks with their subject and object, produced by the most recent analyzeChunk call. */
    List<VerbChunk> verbList = new ArrayList<VerbChunk>();
    // has_stop: the current clause ends at a breaker, so per-clause state is reset after this verb.
    // shared_sub: set by findObject when the next verb chunk should reuse the previous subject.
    // reduced_clause: the next verb chunk was classified as a reduced clause of the current one.
    boolean has_stop = false, shared_sub = false, reduced_clause = false;
    // has_breaker: a breaker was found in front of the current verb chunk.
    // next_clause: a "which"/"that" NP directly precedes the next verb chunk; consumed by findObject.
    boolean has_breaker = false, next_clause = false;
    // Not read or written in the visible part of this class.
    boolean has_comma = false;
    // Take the subject of the current verb chunk from the object of the previous verb chunk.
    boolean use_prev_obj = false;
    // POS tags and tokens of the sentence being analyzed; indexed with Chunk.begin..Chunk.end.
    String[] pos_tags, stokens;
    // Only referenced from commented-out debug output in the visible part of this class.
    public String curr_text;
    
    /**
     * Runs only the clean-up passes on the chunks of one sentence and prints
     * the result to standard output.
     * <p>
     * The list is modified in place by {@code dropChunks} and
     * {@code groupVerbPhrase}.
     *
     * @param ls     chunks of one sentence; modified in place
     * @param tags   POS tags of the sentence, stored in {@code pos_tags}
     * @param tokens tokens of the sentence, stored in {@code stokens}
     */
    public void cleanChunk(List<Chunk> ls, String[] tags, String[] tokens) {
        pos_tags = tags;
        stokens = tokens;
        dropChunks(ls);
        groupVerbPhrase(ls);
        printChunk(ls);
    }

    /**
     * TODO list:
     * 1. and/or in NP followed by VP -> split NP to get subj for VP
     * 2. VB is embedded in NP/ADVP -> split or change to VP
     * 3. PP + VBing -> use [NP] in front of this VP
     * 4. Apposition vs. coordination: NP contains and/or -> coordination
     * 5. NP, NP, VP -> apposition
     * 6. Two coordinations: coord1 and coord2 -> sub-coord, then global coord
     * 7. [where/when/while] break
     * 8. [SBAR] sub-clause -> [O ,] for break
     * 9. Remove [O] chunks like: -, ', "
     *
     */
    /**
     * Analyzes the chunks of a single sentence: for every verb chunk found it
     * determines a subject and an object, and it collects the remaining chunk
     * groups as basic chunks.
     * <p>
     * Side effects:
     * <ul>
     *   <li>{@code ls} is modified in place: a closing "." chunk is appended when
     *       the last chunk is not ".", and chunks are dropped/grouped by
     *       {@code dropChunks} and {@code groupVerbPhrase};</li>
     *   <li>{@code bsList} and {@code verbList} are cleared and refilled;</li>
     *   <li>{@code pos_tags}, {@code stokens} and the per-sentence flags are overwritten;</li>
     *   <li>the JVM is terminated with {@code System.exit(1)} when a verb chunk
     *       has no breaker available.</li>
     * </ul>
     *
     * @param ls     chunks from the parser for one sentence; an empty list leaves
     *               {@code bsList} and {@code verbList} empty
     * @param tags   POS tags of the sentence, indexed with {@code Chunk.begin..Chunk.end}
     * @param tokens tokens of the sentence, indexed like {@code tags}
     */
    public void analyzeChunk(List<Chunk> ls, String tags[], String tokens[]) {
        bsList.clear();
        verbList.clear();
        pos_tags = tags;
        stokens = tokens;
        debug = false;
        if (ls.isEmpty()) {
            return; // nothing to analyze; also avoids ls.get(-1) below
        }
        int i = 0;
        Chunk cur_vb, br = null, c;
        c = ls.get(ls.size() - 1);
        if (!c.txt.equals(".")) {
            // make sure the sentence ends with a "." chunk
            Chunk o_c = new Chunk("O");
            o_c.txt = ".";
            o_c.begin = c.end + 1;
            o_c.end = c.end + 2;
            ls.add(o_c);
        }
        dropChunks(ls); // remove [O] chunk
        groupVerbPhrase(ls); // combining verb phrases and drop ADVPs neighbouring them
        dropChunks(ls); // remove [O] chunk
        List<Chunk> vbs = findVerbChunk(ls);
        List<Chunk> breaker = findBreaker(ls);
        int vbidx , br_idx = 0, sub_type ;
        BasicChunk prev_sub = null, sub , object, prev_obj = null; // previous subject/object and current subject/object
        int start = 0, end = 0, next_start = 0, vb_type;
        boolean merged ;
        next_pos = 0;
        has_stop = false;
        shared_sub = false;
        has_breaker = false;
        /**
         * Work flow:
         * 1. get verb chunk
         * 2. check preceding chunks: [which]; ([,])[and,but,or] -> subject
         * 3. check succeeding chunks: [that, whether] ;[breaker, ;] ; [VP] ; [O.]-> object
         * 4. get verb chunk type
         * 5. find subject ; object
         * 6. assign subject; object for verb chunk if it contains trigger
         */
        while (i < vbs.size()) {
            cur_vb = vbs.get(i);
            vbidx = ls.indexOf(cur_vb); // position of verb in the list
            merged = false ;
            vb_type = getVerbChunkType(cur_vb); // type code of the verb chunk, see getVerbChunkType
            if (!shared_sub) {
                start = skipPhrase(ls, start, vbidx); // skip ADVP or PP that do not contain trigger and key.
                if(vb_type==2 && start == vbidx){
                    // type-2 verb chunk with nothing in front of it: fold it into the previous verb's object
                    merged =true ;
                }
            }
            use_prev_obj = false;
            // check breaker in front of verb chunk
            if (br_idx < breaker.size() || br != null) { // find breaker in front and behind verb chunk
                if (br_idx < breaker.size() && br == null) { // first breaker
                    br = breaker.get(br_idx); // has breaker
                    br_idx++;
                }
                int pos = ls.indexOf(br); // breaker position
                if (((br.txt.equals("whether") || br.txt.equals("that")) && br.type.equals("PP"))
                        || br.type.equals("SBAR")) { // left_over breaker
                    if (pos > 0 && br.begin < cur_vb.begin) { // belongs to the previous VB
                        if (start < pos) {
                            BasicChunk bs = new BasicChunk();
                            for (int k = start; k < pos; k++) {
                                bs.addChunk(ls.get(k));
                            }
                            bsList.add(bs);
                            start = pos + 1;
                        }
                        br = breaker.get(br_idx);
                        br_idx++;
                        pos = ls.indexOf(br);
                    }
                }
                if (br.begin < cur_vb.begin) { // found a breaker in front of verb chunk
                    // check for which/that : which/that VBx
                    if (br.type.equals("NP") && vbidx - pos <= 2 && (br.txt.equals("which") || br.txt.equals("that"))) {
                        if (!hasNounChunk(pos + 1, ls, vbidx)) {
                            use_prev_obj = true;//use NP in front of this chunk as subj
                            has_breaker = true;
                        } else {
                            if (start < pos) {
                                BasicChunk bs = new BasicChunk();
                                for (int k = start; k < pos; k++) {
                                    bs.addChunk(ls.get(k));
                                }
                                bsList.add(bs);
                                start = pos + 1;
                            }
                        }
                    } else {
                        // Copy skip part and put in BasicChunk:
                        if (start < pos) {
                            BasicChunk bs = new BasicChunk();
                            for (int k = start; k < pos; k++) {
                                bs.addChunk(ls.get(k));
                            }
                            bsList.add(bs);
                            start = pos + 1;
                        }
                        has_breaker = true;
                    }
                    while (br_idx < breaker.size() && br.begin < cur_vb.begin) {
                        br = breaker.get(br_idx);
                        br_idx++;
                    }
                }// repeat until no more breaker in front of this verb chunk
                // check breaker behind verb chunk
                // prepare data for object and next clause
                if (reduced_clause) { // set by previous loop
                    use_prev_obj = true;
                    reduced_clause = false;
                    merged = true ;
                }
                if (i + 1 < vbs.size()) { // has next verb
                    Chunk temp = vbs.get(i + 1);
                    int next_vb_type = getVerbChunkType(temp);
                    int next_vb = ls.indexOf(temp); // position of next verb chunk
                    pos = ls.indexOf(br);// position of breaker
                    if ((next_vb > pos)
                            && (br.txt.equals("that") || br.txt.contains("whether"))
                            && (br.type.equals("PP") || br.type.equals("SBAR"))) { // belong to current verb chunk
                        next_start = pos + 1; // position of new clause
                        has_stop = true; // start new clause ; reset all prev values
                        end = pos - 1;
                        br = null;
                    } else if (br.txt.startsWith(";") && pos < next_vb && pos > vbidx) {
                        has_stop = true; // start new clause ; reset all prev values
                        next_start = pos + 1; // position of new clause
                        end = pos - 1;
                        br = null;
                    } else if (pos > vbidx && pos < next_vb) { // VB BR VB
                        if (!(br.type.equals("NP") && next_vb - pos <= 2 && (br.txt.equals("which") || br.txt.equals("that")))) {
                            has_stop = true;
                        } else {
                            next_clause = true;
                        }
                        end = pos - 1;
                        next_start = pos + 1; // position of new clause
                    } else {
                        // set end position for object
                        List<Chunk> comlist = getCommaList(vbidx + 1, ls, next_vb);
                        List<Chunk> conjlist = getConjList(vbidx + 1, ls, next_vb);
                        if (next_vb_type == 2 && next_vb_type != vb_type && comlist.isEmpty() && conjlist.isEmpty() && !has_breaker) { // reduced clause
                            reduced_clause = true;
                            next_start = next_vb;
                        } else {
                            has_stop = false;
                        }
                        end = next_vb - 1; // longest possible (this might include NP of next subject)
                    }
                } else { // last verb
                    if (br.txt.equals(".")) {
                        end = ls.size() - 1;
                        next_pos = end;
                    } else {
                        int br_pos = ls.indexOf(br);// position of breaker
                        end = br_pos - 1;
                        next_pos = br_pos + 1; // position of new clause
                    }
                    has_stop = true;
                }

            } else { // no breaker
                System.out.println("ChunkAnalyzer: analyze >>--------------NO BREAKER --------> NERVER HAPPEND----------");
                System.exit(1);
            }
            sub_type = 0;
            // finding subject
            if (use_prev_obj) { // which / that in front of verb chunk
                if (prev_obj == null ) { // this is the first verb, -> find subject
                    sub = findSubject(start, ls, vbidx - 1);
                } else { // second verb
                    sub = findPreviousNP(prev_obj); // find the closest NP from the preceding object
                    sub_type=1;
                }
            } else if (shared_sub) { // this flag set by previous verb ; main verb chunk of relative clause;
                if (prev_sub != null) { // no NP left to use as subject therefore use previous subject
                    sub = prev_sub ;// cloneChunk(prev_sub); // use previous subject
                    sub_type=1;
                    //sub.extracted = true ; // avoid duplicating extraction -> only use pro
                } else {
                    System.out.println("Chunk Analyzer: Analyze: ----> BUG---------> shared subject --> null");
                    System.out.println(cur_vb.getText() + " ");
                    printChunk(ls);
                    sub = findSubject(start, ls, vbidx - 1); // first verb
                }
            } else { // normal case and unknown case
                if (start < vbidx) {
                    sub = findSubject(start, ls, vbidx - 1);
                } else {
                    // nothing left in front of the verb: empty subject
                    sub = new BasicChunk();
                    debug = true ;
                    sub_count++ ;
                }
            }
            // now find object
            if (has_stop || reduced_clause) { // breaker or relative clause
                object = new BasicChunk();
                for (int k = vbidx + 1; k < end; k++) {
                    object.addChunk(ls.get(k));
                }
                // Range check first, so ls.get(end) is never evaluated for an end that lies
                // before the verb (or is negative).
                if (vbidx + 1 <= end && !ls.get(end).type.equals("O")) { // last verb ; every thing belong to object
                    if (!ls.get(end).txt.equals(",")
                            && !(ls.get(end - 1).txt.equals(",") && ccMap.contains(ls.get(end).txt))
                            && !ccMap.contains(ls.get(end).txt)) {
                        object.addChunk(ls.get(end));
                    }
                }
            } else {
                object = findObject(vbidx + 1, ls, end); // must detect subj(NP) of the next verb
            }
            // set up verb chunk
            VerbChunk verb = new VerbChunk();
            verb.verb = cur_vb;
            verb.subject = sub;
            verb.object = object;
            verb.subject_type = sub_type;
            if(sub==null || sub.isEmpty()){ // 15-11-2011: Test new case
                if(vb_type==0 && !pos_tags[cur_vb.begin].equals("VBN")){
                    if(prev_obj!=null){
                        verb.subject = findPreviousNP(prev_obj);
                        verb.subject_type =1 ;
                    }
                }
            }
            verb.verb_type = vb_type;
            if(!verb.isQualify()&& (verb.subject_type==1||verb.subject.isEmpty())){ // 22-12-2011
                merged =true ;
            }
            if(!merged || verbList.isEmpty()){
                verbList.add(verb);
            }else {
                // append this verb and its object to the object of the previous verb chunk;
                // the explicit cast is valid whether or not the verbList field carries a type argument
                VerbChunk verb1 = (VerbChunk) verbList.get(verbList.size()-1);
                verb1.object.addChunk(cur_vb);
                verb1.object.addChunk(object);
            }
            // reset values
            if (prev_sub == null || !use_prev_obj) {
                prev_sub = sub;
            }
            prev_obj = object;
            if (has_stop) {
                start = next_start;
                has_stop = false;
                has_breaker = false;
                prev_obj = null;
                prev_sub = null;
                shared_sub = false;
            } else if (reduced_clause) {// setup values for the next loop
                start = next_start;
            } else {
                start = next_pos;
                next_start = 0;
            }
            i++;// next verb  chunk
        }// while loop
        // chunks left after the last used position: keep them only if they hold both a pro and a trigger
        if (next_pos < ls.size() - 1 || vbs.isEmpty()) {
            BasicChunk bsc = new BasicChunk();
            for (i = next_pos; i < ls.size(); i++) {
                bsc.addChunk(ls.get(i));
            }
            if (bsc.proCount() > 0 && bsc.trgCount() > 0) {
                bsList.add(bsc);
            }
        }
        // Verb chunks that qualify as a whole, but whose verb has no trigger and whose subject and
        // object do not qualify on their own, are turned into one merged basic chunk.
        // Collected first, because verbList cannot be modified while it is being iterated.
        List<VerbChunk> rlist = new ArrayList<VerbChunk>();
        for (Object o : verbList) {
            VerbChunk vc = (VerbChunk) o;
            if(vc.isQualify()&& vc.verb.trigs.isEmpty()&& !vc.subject.isQualify()&& !vc.object.isQualify()&& vc.subject_type!=1){
                rlist.add(vc);
            }
        }
        for(VerbChunk vc:rlist){
            verbList.remove(vc);
            bsList.add(vc.merge());
        }
    }
    // Counts the verb chunks for which analyzeChunk found no chunk in front of the verb to use as subject.
    int sub_count = 0;

    /**
     * Tells whether the chunks at positions {@code start..stop} (both inclusive)
     * contain a noun phrase other than a bare "that" or "which".
     *
     * @param start first position to inspect
     * @param ls    chunk list of the sentence
     * @param stop  last position to inspect (inclusive)
     * @return {@code true} if such an NP chunk exists; {@code false} otherwise,
     *         including when {@code start > stop}
     */
    private boolean hasNounChunk(int start, List<Chunk> ls, int stop) {
        for (int i = start; i <= stop; i++) {
            Chunk c = ls.get(i);
            if (c.type.equals("NP") && !(c.txt.equals("that") || c.txt.equals("which"))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Classifies a verb chunk using the tokens and POS tags of the current
     * sentence ({@code stokens} / {@code pos_tags}, indexed by
     * {@code vb.begin..vb.end}).
     *
     * @param vb the verb chunk to classify
     * @return a type code:
     *         <ul>
     *           <li>4 - the chunk spans more than one token and its text starts with "to "</li>
     *           <li>3 - the last token is tagged VBG and no token is in {@code beMap}</li>
     *           <li>1 - the chunk spans more than one token, contains a token in
     *               {@code beMap} and a token tagged VBN (passive)</li>
     *           <li>2 - any other chunk containing a token tagged VBN</li>
     *           <li>0 - everything else (active)</li>
     *         </ul>
     */
    public int getVerbChunkType(Chunk vb) {
        if (vb.txt.toLowerCase().startsWith("to ") && vb.end > vb.begin) {
            return 4;
        }
        boolean sawBe = false;
        boolean sawParticiple = false;
        for (int idx = vb.begin; idx <= vb.end; idx++) {
            // non-short-circuit |= so both arrays are always read, as in a plain if/assign
            sawBe |= beMap.contains(stokens[idx]);
            sawParticiple |= pos_tags[idx].equals("VBN");
        }
        if (pos_tags[vb.end].equals("VBG") && !sawBe) {
            return 3;
        }
        if (!sawParticiple) {
            return 0;
        }
        // a participle is present: passive only with a form of "be" in a multi-token chunk
        return (vb.end > vb.begin && sawBe) ? 1 : 2;
    }

    /**
     * Skips an introductory phrase in front of a verb chunk and returns the
     * position from which the subject search should start.
     * <p>
     * A leading "," and a leading conjunction (from {@code ccMap}) are always
     * skipped. If the remaining span is long enough and starts with a PP, a
     * "to ..." VP or an SBAR (optionally preceded by "ADVP ,"), a comma inside
     * the span is used as separator: the chunks in front of it are stored as one
     * {@code BasicChunk} in {@code bsList}, and the position after the comma is
     * returned. When no usable separator is found, nothing is stored.
     *
     * @param ls    chunk list of the sentence
     * @param start first position of the span
     * @param end   position of the verb chunk (end of the span)
     * @return the position where the subject search should begin
     */
    public int skipPhrase(List<Chunk> ls, int start, int end) {
        if (start == end) {
            return start;
        }
        int begin = start;
        if (ls.get(begin).txt.equals(",")) {
            begin++;
        }
        if (ccMap.contains(ls.get(begin).txt)) {
            begin++;
        }
        if (end - begin > 2 && end < ls.size() - 3 && begin >= 0) {
            if (ls.get(begin).type.equals("ADVP") && ls.get(begin + 1).txt.equals(",")) {
                begin += 2;
            }
            Chunk c = ls.get(begin);
            if (c.type.equals("PP") || (c.type.equals("VP") && c.txt.toLowerCase().startsWith("to ")) || c.type.equals("SBAR")) { // PP or VP
                // note: commas and conjunctions are collected from 'start', not from 'begin'
                List<Chunk> comma = getCommaList(start, ls, end);
                List<Chunk> conj = getConjList(start, ls, end);
                BasicChunk bs = new BasicChunk();
                if (comma.size() == 1) { // this might be the separator between object/subject
                    // use "," to separate two clauses
                    Chunk com = comma.get(0);
                    int com_pos = ls.indexOf(com);
                    if (!hasNounChunk(com_pos, ls, end)) {
                        return begin;
                    } else {
                        end = com_pos - 1;
                        // add to BasicChunk
                        for (int i = begin; i <= end; i++) {
                            bs.addChunk(ls.get(i));
                        }
                        bsList.add(bs);
                        return com_pos + 1;
                    }
                } else if (comma.isEmpty()) { // no comma, may be never happened
                    return begin;
                } else { // more than one comma, find the last comma
                    if (conj.size() > 0) {
                        // take last conj and comma
                        Chunk and = conj.get(conj.size() - 1);
                        Chunk com = comma.get(comma.size() - 1);
                        if (and.begin == com.begin + 1) { // part of co-ordination
                            int pos = ls.indexOf(and);
                            if (ls.get(pos + 1).type.equals("NP")) { //,and NP
                                // find commas that are part of this co-ordination:
                                // walk back over "," and NP chunks to the first other chunk
                                int idx = 0;
                                for (int i = pos - 1; i > begin; i--) {
                                    c = ls.get(i);
                                    if (c.txt.equals(",") || c.type.equals("NP")) {
                                        continue;
                                    } else {
                                        idx = c.begin;
                                        break;
                                    }
                                }
                                // the last comma in front of that chunk is the separator
                                for (int i = comma.size() - 1; i >= 0; i--) {
                                    c = comma.get(i);
                                    if (c.begin < idx) {
                                        pos = ls.indexOf(c);
                                        for (int j = begin; j <= pos; j++) {
                                            bs.addChunk(ls.get(j));
                                        }
                                        bsList.add(bs);
                                        return pos + 1;
                                    }
                                }
                            }
                        }
                    }
                    // no conj or con_pos < com_pos
                    Chunk com = comma.get(0); // assume the first comma is the separator
                    int pos = ls.indexOf(com);
                    if (hasNounChunk(pos, ls, end)) {
                        end = pos - 1;
                        for (int i = begin; i <= end; i++) {
                            bs.addChunk(ls.get(i));
                        }
                        bsList.add(bs);
                        return pos + 1;
                    }

                }


            } // PP or VP
        }
        return begin;
    }

    /**
     * Removes chunks that are not needed for the analysis; {@code ls} is
     * modified in place.
     * <ol>
     *   <li>If the sentence starts with a VP that has no trigger: when no chunk in
     *       front of the first comma carries a pro or a trigger, everything up to
     *       and including that comma is removed; otherwise, when the second chunk
     *       is a PP/SBAR "that" or "whether", the first two chunks are removed.</li>
     *   <li>Every [O] chunk whose text is in {@code skipMap} is removed.</li>
     *   <li>Every ADVP chunk that is not in {@code allMap} and has neither pros nor
     *       triggers is removed, unless it is the first or the last chunk or is
     *       enclosed by commas on both sides.</li>
     * </ol>
     *
     * @param ls chunk list of one sentence; an empty list is left untouched
     */
    private void dropChunks(List<Chunk> ls) {
        if (ls.isEmpty()) {
            return; // nothing to drop; also avoids ls.get(0) below
        }
        List<Chunk> list = new ArrayList<Chunk>(); // chunks scheduled for removal
        Chunk c, next, prev = null;
        int begin = 0, end = ls.size() - 1;
        if (ls.get(begin).type.equals("VP")) { // drop verb chunk
            Chunk c1;
            boolean remove = true;
            c1 = ls.get(begin);
            if (c1.trigs.isEmpty()) {
                List<Chunk> comma = getCommaList(begin, ls, end);
                int pos = comma.isEmpty() ? -1 : ls.indexOf(comma.get(0));
                if (pos > 0) {
                    for (int i = begin; i < pos; i++) {
                        c1 = ls.get(i);
                        if (c1.pros.size() > 0 || c1.trigs.size() > 0) {
                            remove = false;
                            break;
                        }
                    }
                }
                if (pos > 0 && remove) {
                    // remove chunks 0..pos (the comma included)
                    while (pos >= 0) {
                        ls.remove(0);
                        pos--;
                    }
                } else if (ls.size() > 1) { // a single-chunk sentence has no second chunk to inspect
                    c1 = ls.get(1);
                    if ((c1.type.equals("PP") || c1.type.equals("SBAR"))
                            && (c1.txt.equals("that") || c1.txt.equals("whether"))) {
                        ls.remove(0);
                        ls.remove(0);
                    }
                }
            }
        }
        for (int i = 0; i < ls.size(); i++) {
            c = ls.get(i);
            if (skipMap.contains(c.txt) && c.type.equals("O")) {
                list.add(c);
            } else if (c.type.equals("ADVP") && !allMap.contains(c.txt)) {
                if (c.pros.isEmpty() && c.trigs.isEmpty()) {
                    if (i + 1 < ls.size()) {
                        next = ls.get(i + 1);
                        if (prev != null && !(prev.txt.equals(",") && next.txt.equals(","))) {
                            list.add(c);
                        }
                    }
                }
            }
            prev = c;
        }
        // removal is done after the scan so the index loop above is not disturbed
        for (Chunk ch : list) {
            ls.remove(ch);
        }
    }

    /**
     * Returns the chunk to use as subject of a 'which' / 'that' relative clause,
     * given the object of the preceding verb chunk.
     * <p>
     * As written, the method always returns {@code obj} itself: the
     * {@code if(true)} guard returns before any other work is done.
     * NOTE(review): the guard looks like a deliberate switch-off of the code
     * below - confirm before removing either part.
     * <p>
     * The unreachable remainder would build a new {@code BasicChunk} from the
     * chunks that follow the last PP chunk of {@code obj} (a PP at index 0 is
     * not considered), or from all chunks when no PP is found, and mark it as
     * extracted.
     *
     * @param obj object of the preceding verb chunk
     * @return {@code obj}, unchanged
     */
    public BasicChunk findPreviousNP(BasicChunk obj) {
        BasicChunk bs = new BasicChunk();
        if(true){
            return obj ;
        }
        // ---- unreachable from here on ----
        int i = obj.chunkList.size()-1;
        boolean found =false ;
        while (i >0) { // search backwards for the last PP chunk
            if (obj.chunkList.get(i).type.equals("PP")) {
                found =true ;
                break;
            } else {
                i--;
            }
        }
        if(found){
            i=i+1 ; // start right after the PP
        }else{
            i = 0; // no PP: take everything
        }
        if (i < obj.chunkList.size()) {
            for (; i < obj.chunkList.size(); i++) {
                bs.addChunk(obj.chunkList.get(i));
            }
        }
        bs.extracted  = true ; // avoid duplicating extraction
        return bs;
    }

    /**
     * Creates a new {@code BasicChunk} and adds to it, in order, every chunk of
     * {@code bc.chunkList}. The chunk objects themselves are shared, not copied.
     *
     * @param bc the basic chunk to copy from
     * @return a new basic chunk built from the same chunks
     */
    private BasicChunk cloneChunk(BasicChunk bc) {
        BasicChunk copy = new BasicChunk();
        int idx = 0;
        while (idx < bc.chunkList.size()) {
            copy.addChunk(bc.chunkList.get(idx));
            idx++;
        }
        return copy;
    }
    
    public BasicChunk findSubject(int start, List chunk, int stop) {
        BasicChunk sub = new BasicChunk();
        Chunk c, next;
        if (stop >= 0) {
            next = chunk.get(stop);
        } else {
            if(start BUG in findSubject, start< stop");
                System.exit(0);
            }
            return sub;
        }
        if (next.txt.equals("that") || next.txt.equals("which")) {
            if (stop - start >= 2) {
                c = chunk.get(stop - 1);
                if (c.txt.equals(",")) { // has comma
                    stop = stop - 2;
                } else { // no comma
                    stop = stop - 1;
                }
            }
        } else if (next.txt.equals(",")||ccMap.contains(next.txt)) { // commar before verb
            stop--;
        }
        for (int i = start; i <= stop; i++) {
            sub.addChunk(chunk.get(i));
        }
        return sub;
    }

    /**
     * Prints all chunks on one line of standard output, each as
     * {@code [type text]}, followed by a line break.
     *
     * @param chunk chunks to print
     */
    public void printChunk(List<Chunk> chunk) {
        for (Chunk c : chunk) {
            System.out.print("[" + c.type + " " + c.txt + "]");
        }
        System.out.println("");
    }

    /**
     * Prints the chunks at positions {@code start..stop} (both inclusive) on one
     * line of standard output, each as {@code [type text]} followed by a space.
     *
     * @param start first position to print
     * @param chunk chunk list
     * @param stop  last position to print (inclusive)
     */
    private void printChunk(int start, List<Chunk> chunk, int stop) {
        for (int t = start; t <= stop; t++) {
            Chunk c = chunk.get(t);
            System.out.print("[" + c.type + " " + c.txt + "] ");
        }
        System.out.println("");
    }
    /**
     * Flag that is neither read nor written in the visible part of this class.
     * NOTE(review): presumably a debug-output switch - confirm against its users.
     * (The former text here, "Get list of verb", describes findVerbChunk(List),
     * which is defined further below.)
     */
    boolean print = false;

    /**
     * Collects the conjunction chunks (text contained in {@code ccMap}) at
     * positions {@code start..end}, both inclusive, in list order.
     *
     * @param start first position to inspect
     * @param ls    chunk list of the sentence
     * @param end   last position to inspect (inclusive)
     * @return the conjunction chunks found; empty when there are none
     */
    private List<Chunk> getConjList(int start, List<Chunk> ls, int end) {
        List<Chunk> conj = new ArrayList<Chunk>();
        for (int i = start; i <= end; i++) {
            Chunk c = ls.get(i);
            if (ccMap.contains(c.txt)) {
                conj.add(c);
            }
        }
        return conj;
    }

    /**
     * Returns the verb chunks to analyze: every VP chunk whose type code from
     * {@code getVerbChunkType} is below 3 (which leaves out the VBing and
     * "to VB" chunks) and whose text does not start with "as ".
     *
     * @param chunk chunk list of the sentence
     * @return the selected VP chunks in sentence order; empty when there are none
     */
    private List<Chunk> findVerbChunk(List<Chunk> chunk) {
        List<Chunk> verbs = new ArrayList<Chunk>();
        for (Chunk c : chunk) {
            if (c.type.equals("VP")
                    && getVerbChunkType(c) < 3 // skip VB+ing and To VB
                    && !c.txt.startsWith("as ")) {
                verbs.add(c);
            }
        }
        return verbs;
    }
    // Position in the chunk list from which the next clause (or the leftover chunks) starts;
    // written by analyzeChunk and findObject, read by analyzeChunk.
    int next_pos = 0; // set position of last used chunk
    // Not referenced in the visible part of this class.
    public boolean print_chunk = false;
    // Not referenced in the visible part of this class.
    public int count_chunk = 0;

    public BasicChunk findObject(int start, List ls, int end) {
        // end: end of sentence or next verb chunk
        // Patterns: VB NP.....,NP VB; 
        // next verb is reduced clause
        BasicChunk obj = new BasicChunk();
        List comma = getCommaList(start, ls, end);
        List conj = getConjList(start, ls, end);
        shared_sub = false;
        if (next_clause) { // which /that
            next_pos = end + 1;
            next_clause = false;
        } else if (has_breaker) {// which, that, when, while...
            if (comma.size() > 0) { // check commar in front of next verb
                Chunk com = comma.get(comma.size() - 1);
                int com_pos = ls.indexOf(com);
                if (!hasNounChunk(com_pos, ls, end)) { // no NP between comma and VB
                    shared_sub = true; // set flag to use shared subject
                } else {
                    next_pos = com_pos + 1;
                    end = com_pos - 1;
                }
            } else if (conj.isEmpty()||isShare(start,end,ls)) { // no separator ; take all chunk
                shared_sub = true;
            }else { // use all 
                shared_sub = true;
//                System.out.println("----> Find Object: unknown case-- has breaker --: start :"+start+" end: "+end );
//                printChunk(ls);
//                System.out.println("---Text: "+curr_text);
            }
        } else if(isShare(start,end,ls)){
            shared_sub = true;
            if(ls.get(end-1).txt.equals(",")){
                end =end -2 ;
            }
        } else if (comma.size() == 1) { // this might be the separator between object/subject
            if (conj.size() > 0) {
                Chunk and = conj.get(conj.size() - 1);
                int conj_pos = ls.indexOf(and);
                if (!hasNounChunk(conj_pos, ls, end)) { // share subject
                    shared_sub = true; // set flag to use shared subject
                } else { // conjunction clause
                    Chunk com = comma.get(0);
                    int com_pos = ls.indexOf(com);
                    if (com.begin < and.begin && conj_pos - com_pos == 1) { // set end point for object
                        end = com_pos - 1;
                        // skip [, and]
                        next_pos = conj_pos + 1;
                    } else {
                        if (!hasNounChunk(com_pos, ls, end)) {
                            shared_sub = true;
                        } else { //use comma as separator
                            next_pos = com_pos + 1;
                            end = com_pos - 1; // nerver happend
//                            System.out.println("---> Find Object: ---> Never happened but happened! ");
//                            System.out.println("----> Find Object: unknown case: start :"+start+" end: "+end );
//                            printChunk(ls);
//                            System.out.println("---Text: "+curr_text);
                        }
                    }
                }
            } else { // use "," to separate two clauses
                Chunk com = comma.get(0);
                int com_pos = ls.indexOf(com);
                if (!hasNounChunk(com_pos, ls, end)) { // share subject
                    shared_sub = true; // set flag to use shared subject
                } else {
                    next_pos = com_pos + 1;
                    end = com_pos - 1;
                }
            }
        } else if (comma.isEmpty()) { // find conjunction: and/but/or
            if (conj.size() > 0) {
                Chunk and = conj.get(conj.size() - 1);
                int conj_pos = ls.indexOf(and);
                if (!hasNounChunk(conj_pos, ls, end)) { // share subject
                    shared_sub = true; // set flag to use shared subject
                } else {
                    next_pos = conj_pos + 1;
                }
                end = conj_pos - 1;
            } else { // no comma ; no conj ->> take all chunk until next verb chunk ;
                //shared_sub = true; // set flag to use shared subject
                next_pos=end+1 ;
            }
        } else { // more than one comma, find the last comma
            boolean set = false;
            if (conj.size() > 0) {
                // take last conj and comma
                Chunk and = conj.get(conj.size() - 1);
                Chunk com = comma.get(comma.size() - 1);
                if (and.begin > com.begin) { // , and 
                    int pos = ls.indexOf(and);
                    if (hasNounChunk(pos, ls, end)) { // assume start of next clause
                        if (checkCoord(start, ls, pos)) {
                            next_pos = pos + 1;
                            end = pos - 1;
                        } else { // conj is a part of co-ordination
                            // don't know to to split
                            // assume shared now,
                         next_pos = end+1;  // shared_sub = true;
                        }
                    } else { // assume shared subject
                        shared_sub = true; // set flag to use shared subject
                    }
                    set = true;
                }
            }
            if (!set) { // no conj or con_pos < com_pos
                Chunk com = comma.get(comma.size() - 1);
                int pos = ls.indexOf(com);
                if (hasNounChunk(pos, ls, end)) { // assume start of next clause
                    next_pos = pos + 1;
                    end = pos - 1;
                } else { // assume shared subject
                    shared_sub = true; // set flag to use shared subject
                }
            }
        }
        if (shared_sub) {
            next_pos = end + 2;// skip verb
        }
        has_breaker = false;
        for (int i = start; i < end; i++) {
            obj.addChunk(ls.get(i));
        }
        if (end >0 && end < ls.size()&& !ls.get(end).txt.equals(",")&&
             !(ls.get(end-1).txt.equals(",") && ccMap.contains(ls.get(end).txt)) &&
               !ccMap.contains(ls.get(end).txt) ) {
            obj.addChunk(ls.get(end));
        }
        return obj;
    }

    /**
     * Collects the chunks that act as clause breakers.
     * <p>
     * A chunk qualifies when its text is contained in {@code breakMap}, when its
     * text starts with ";", or when its type is "SBAR" and its text is not "as".
     *
     * @param ls chunks to scan
     * @return a new list holding the qualifying chunks in their original order
     */
    private List<Chunk> findBreaker(List<Chunk> ls) {
        List<Chunk> list = new ArrayList<Chunk>();
        for (Chunk c : ls) {
            // && binds tighter than ||; the parentheses only make that explicit.
            if (breakMap.contains(c.txt)
                    || c.txt.startsWith(";")
                    || (c.type.equals("SBAR") && !c.txt.equals("as"))) {
                list.add(c);
            }
        }
        return list;
    }

    /**
     * Tells whether the token at the end of a verb chunk is a form of "to be".
     *
     * @param verb verb chunk; {@code verb.end} is used as an index into the
     *             sentence tokens
     * @return {@code true} if the token at {@code verb.end} is in {@code beMap}
     */
    public boolean is_a_phrase(Chunk verb) {
        String lastToken = stokens[verb.end];
        return beMap.contains(lastToken);
    }

    /**
     * Reports whether the chunk at position {@code end} is a coordinating
     * conjunction, i.e. its text is contained in {@code ccMap}.
     * <p>
     * The answer is always {@code false} when the span holds fewer than two
     * positions ({@code end - start < 1}) or when {@code end} does not index
     * an element of {@code ls}.
     *
     * @param start first position of the span
     * @param end   position of the chunk to test
     * @param ls    chunk list the positions refer to
     * @return {@code true} if the chunk at {@code end} is a conjunction
     */
    private boolean isShare(int start, int end, List<Chunk> ls) {
        if (end - start < 1) {
            return false;
        }
        // Guard the lookup: an out-of-range end means there is no chunk to test.
        if (end < 0 || end >= ls.size()) {
            return false;
        }
        return ccMap.contains(ls.get(end).txt);
    }
    
    /**
     * Decides whether a verb chunk is in passive voice.
     * <p>
     * The chunk is treated as passive when it carries exactly one trigger, that
     * trigger is tagged "VBN", and at least one token between
     * {@code verb.begin} and {@code verb.end} (inclusive) is in {@code beMap}.
     *
     * @param verb verb chunk to inspect
     * @return {@code true} if the chunk matches the passive pattern above
     */
    public boolean is_passive(Chunk verb) {
        if (verb.trigs.size() != 1) {
            return false;
        }
        Word w = verb.trigs.get(0);
        if (!w.pos_tag.equals("VBN")) {
            return false;
        }
        // Only now scan the chunk's tokens for a form of "to be".
        for (int i = verb.begin; i <= verb.end; i++) {
            if (beMap.contains(stokens[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Collects the separator chunks found in a range of a chunk list. A chunk
     * counts as a separator when its text is exactly "," or ":".
     *
     * @param start index of the first chunk to inspect
     * @param chunk list of chunks to scan
     * @param end   index of the last chunk to inspect (inclusive)
     * @return a new list with the separator chunks, in their original order
     */
    private List<Chunk> getCommaList(int start, List<Chunk> chunk, int end) {
        List<Chunk> comma = new ArrayList<Chunk>();
        for (int i = start; i <= end; i++) {
            Chunk c = chunk.get(i);
            if (c.txt.equals(",") || c.txt.equals(":")) {
                comma.add(c);
            }
        }
        return comma;
    }

    /**
     * Fixes false-positive VP chunks caused by parser errors and tidies the
     * chunks around genuine VPs. The list is modified in place: chunks that
     * were merged into a neighbour, or dropped, are removed at the end.
     * <p>
     * Steps applied while walking the list:
     * <ul>
     *   <li>A single-token VP whose POS tag does not start with "VB" is merged
     *       into a preceding NP, or merged with a following NP and retyped as
     *       "NP", or retyped as "NP" when its tag starts with "NN".</li>
     *   <li>A single-token NP whose POS tag starts with "VB" gets that POS tag
     *       as its chunk type.</li>
     *   <li>Around a VP: an adjacent ADVP without proteins and triggers (and not
     *       in {@code breakMap}) is dropped; a following ADJP, a following VP
     *       starting with "to ", or a "preposition + conjunction + VP" sequence
     *       is merged into the VP.</li>
     * </ul>
     *
     * @param ls list of chunks of one sentence; modified in place
     */
    private void groupVerbPhrase(List<Chunk> ls) {
        int i = 0;
        Chunk c, prev = null, next;
        // Chunks are only marked here and removed after the walk, so indices
        // stay stable while iterating.
        List<Chunk> remove = new ArrayList<Chunk>();
        while (i < ls.size()) {
            c = ls.get(i);
            if (c.type.equals("VP") && c.begin == c.end) {
                if (!pos_tags[c.begin].startsWith("VB")) {
                    // Single-token "VP" that is not tagged as a verb.
                    if (prev != null && prev.type.equals("NP")) {
                        prev.merge(c);
                        remove.add(c);
                        i++;
                        if (i < ls.size()) {
                            c = ls.get(i);
                            if (c.type.equals("NP")) {
                                prev.merge(c);
                                remove.add(c);
                            }
                            i++;
                            if (i < ls.size()) {
                                c = ls.get(i);
                            }
                        }
                    } else if (i + 1 < ls.size()) {
                        next = ls.get(i + 1);
                        if (next.type.equals("NP")) {
                            c.merge(next);
                            c.type = "NP";
                            remove.add(next);
                            i++;
                            // i now points at the merged-away NP; look one past
                            // it only if such a chunk exists (the previous guard
                            // tested i but read i + 1, overrunning the list when
                            // the NP was the last chunk).
                            if (i + 1 < ls.size()) {
                                c = ls.get(i + 1);
                            }
                        } else if (pos_tags[c.begin].startsWith("NN")) {
                            c.type = "NP";
                        }
                    } else {
                        System.out.println("PARSER: Fixing verb phrase: ----->Unknown case: " + c.getValues());
                    }
                }
            } else if (c.type.equals("NP") && c.begin == c.end) {
                if (pos_tags[c.begin].startsWith("VB")) {
                    // The chunk type becomes the POS tag itself, not "VP".
                    // NOTE(review): confirm downstream code expects a tag here.
                    c.type = pos_tags[c.begin];
                }
            }
            // now remove ADVP in front of VP
            if (c.type.equals("VP")) {
                if (prev != null && prev.type.equals("ADVP") && !breakMap.contains(prev.txt)) {
                    if (prev.pros.isEmpty() && prev.trigs.isEmpty()) {
                        remove.add(prev);
                    }
                }
                if (i < ls.size() - 1) { // behind
                    next = ls.get(i + 1);
                    if (next.type.equals("ADVP") && !breakMap.contains(next.txt)) {
                        if (next.pros.isEmpty() && next.trigs.isEmpty()) {
                            remove.add(next);
                            i++; // skip the dropped ADVP
                        }
                    } else if (next.type.equals("ADJP")) {
                        c.merge(next);
                        remove.add(next);
                        i++; // skip the merged ADJP
                    } else if (next.type.equals("VP") && next.txt.startsWith("to ")) {
                        c.merge(next);
                        remove.add(next);
                        i++; // skip the merged "to ..." VP
                    } else if (prepMap.contains(next.txt)) {
                        // Pattern: VP, preposition, conjunction, VP -> one VP.
                        if (i + 3 < ls.size()) {
                            Chunk tmp1 = ls.get(i + 2);
                            Chunk tmp2 = ls.get(i + 3);
                            if (ccMap.contains(tmp1.txt) && tmp2.type.equals("VP")) {
                                c.merge(next);
                                c.merge(tmp1);
                                c.merge(tmp2);
                                remove.add(next);
                                remove.add(tmp1);
                                remove.add(tmp2);
                                i += 3;
                            }
                        }
                    }
                }
            }
            prev = c;
            i++;
        }
        for (Chunk ch : remove) {
            ls.remove(ch);
        }
    }

    /**
     * Checks whether the chunks just before position {@code end} look like a
     * coordination that can be split at {@code end}.
     * <p>
     * The chunk right after {@code end} must be an NP. The chunks from
     * {@code end - 1} down to {@code start} are then scanned backwards: commas
     * are skipped, NP chunks and conjunctions (text in {@code ccMap}) are
     * counted, and any other chunk stops the scan.
     *
     * @param start lowest index included in the backward scan
     * @param ls    chunk list the positions refer to
     * @param end   position of the candidate split point
     * @return {@code true} if the chunk after {@code end} is an NP and the scan
     *         found at least one NP and at least one conjunction
     */
    private boolean checkCoord(int start, List<Chunk> ls, int end) {
        if (end + 1 >= ls.size() || !ls.get(end + 1).type.equals("NP")) {
            return false;
        }
        int count = 0;
        int conj = 0;
        for (int i = end - 1; i >= start; i--) {
            Chunk c = ls.get(i);
            if (c.txt.equals(",")) {
                continue;
            }
            if (c.type.equals("NP")) {
                count++;
            } else if (ccMap.contains(c.txt)) {
                conj++;
            } else {
                break;
            }
        }
        return count >= 1 && conj >= 1;
    }

    /**
     * Small demo driver: tokenizes, tags and parses one sample sentence, prints
     * the raw chunks, runs {@code analyzeChunk} on them, and then prints the
     * resulting basic chunks and verb chunks.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Parser sentenceParser = new Parser();
        String sentence = "To investigate whether PRO4 can be expressed by any T cell subset or if expression is restricted to a distinct lineage, PRO5 mRNA expression was analyzed in freshly isolated T cells such as PRO6-depleted PRO7+ cells, PRO8+ naive or PRO9+ memory T cells (Figure 1A), as well as T cells driven in vitro toward Th1, Th2, or iTreg phenotypes (Figure 1B; phenotype on Figure S1).";
        String[] words = sentenceParser.splitWord(sentence);
        String[] posTags = sentenceParser.POSTag(words);
        List chunks = sentenceParser.parse(words, posTags);

        // Show the chunks as delivered by the parser, then analyze them.
        ChunkAnalyzer chunkAnalyzer = new ChunkAnalyzer();
        chunkAnalyzer.printChunk(chunks);
        System.out.println("");
        chunkAnalyzer.analyzeChunk(chunks, posTags, words);

        // Basic chunks collected by the analysis.
        for (BasicChunk basic : chunkAnalyzer.bsList) {
            basic.printChunk();
            System.out.println("");
        }

        // Instantiated but not used below.
        BasicChunk scratch = new BasicChunk();

        // Verb chunks collected by the analysis.
        for (VerbChunk verbChunk : chunkAnalyzer.verbList) {
            verbChunk.print();
            System.out.println("");
        }
    }
    boolean debug = false;

    public Set prepMap = new HashSet(15);
    public Set appoMap = new HashSet(5);
    public Set breakMap = new HashSet(25);
    public Set skipMap = new HashSet(10);
    public Set is_a_Map = new HashSet(10);
    public Set extraMap = new HashSet(10);
    public Set ccMap = new HashSet(10);
    public Set beMap = new HashSet(10);
    public Set allMap = new HashSet(50);
    String prepList[] = {"to", "with", "from", "of", "on", "in", "upon", "by", "for", "after", "through", "between","via"};
    String skipList[] = {"not", "neither", "-", "\"", "'", "both", "also", "nor", "(", "]", "[", ")"};
    String breakList[] = {"while", "when", "whereas", "although", "if", "because", "even though", "whether", "since", "that",
        "whenever", "whatever", "before", "how", "which", ";", "."};// start a new clause
    String ccList[] = {"and", "or", "but", "as well as", "but not"};
    // Some breakers are not used at this moment: as , after
    String appoList[] = {"like", "such as", "including", "includes","containing"};
    String to_be[] = {"be", "is", "are", "was", "were", "been"};
    String is_a[] = {"had", "have", "has"}; //+ tobe
}