org.textmining.extraction.word.model.ComplexNodeHelper Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of tm-extractors Show documentation

Java-based library that can extract text from Microsoft Word for Windows binary documents, including Word 1.0/2.0/4.0/6.0/95/97/2000/XP/2003. It extracts text from fast-saved files as well.

The newest version!

/*
 * Textmining.org text extractors
 * 
 * Copyright (C) 2008 Benryan Software Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */
package org.textmining.extraction.word.model;

import java.util.Collections;
import java.util.List;

/**
 * {@link NodeHelper} variant that builds character-property nodes by walking
 * every entry of the inherited text piece list and that can post-process a
 * sorted node list so consecutive nodes start where the previous one ended.
 */
public class ComplexNodeHelper extends NodeHelper
{

  /**
   * Creates a helper backed by the given text piece table.
   *
   * @param tpt the text piece table, handed unchanged to the superclass
   *            constructor.
   */
  public ComplexNodeHelper(TextPieceTable tpt)
  {
    super(tpt);
  }

  /**
   * Appends one {@code CHPX} to {@code nodes} for every text piece that the
   * range {@code fcStart}..{@code fcEnd} touches.
   * <p>
   * For each such piece the range is clamped to the piece's own
   * {@code getFcStart()}..{@code getFcEnd()} bounds and the clamped offsets are
   * converted to positions relative to the piece's {@code getStart()} value.
   * Offsets are divided by 2 for pieces whose {@code unicode()} is true and by
   * 1 otherwise (presumably the number of bytes per character - confirm
   * against {@code TextPiece}).
   *
   * @param nodes   list that receives the newly created {@code CHPX} objects.
   * @param grpprl  property data passed through to every created {@code CHPX}.
   * @param fcStart start offset of the range, in the same units as
   *                {@code TextPiece.getFcStart()}.
   * @param fcEnd   end offset of the range, in the same units as
   *                {@code TextPiece.getFcEnd()}.
   */
  public void addChpNodes(List nodes, byte[] grpprl, int fcStart, int fcEnd)
  {
    int index = 0;
    while (index < _textPieces.size())
    {
      TextPiece piece = (TextPiece)_textPieces.get(index);
      index++;

      int pieceFcStart = piece.getFcStart();
      int pieceFcEnd = piece.getFcEnd();

      // A piece is skipped only when the range sits wholly in front of it or
      // wholly behind it; both comparisons of each pair are kept so that
      // degenerate ranges are treated exactly as before.
      boolean rangeBeforePiece = fcStart < pieceFcStart && fcEnd <= pieceFcStart;
      boolean rangeAfterPiece = fcStart >= pieceFcEnd && fcEnd > pieceFcEnd;
      if (rangeBeforePiece || rangeAfterPiece)
      {
        continue;
      }

      // Clamp the requested range to this piece.
      int clampedStart = Math.max(fcStart, pieceFcStart);
      int clampedEnd = Math.min(fcEnd, pieceFcEnd);

      int unitSize;
      if (piece.unicode())
      {
        unitSize = 2;
      }
      else
      {
        unitSize = 1;
      }

      // Translate the clamped offsets into positions relative to the piece's
      // starting position.
      int offsetIntoPiece = (clampedStart - pieceFcStart)/unitSize;
      int cpStart = piece.getStart() + offsetIntoPiece;
      int length = (clampedEnd - clampedStart)/unitSize;
      int cpEnd = cpStart + length;

      nodes.add(new CHPX(cpStart, cpEnd, grpprl));
    }
  }

  /**
   * Sorts {@code nodes} into their natural order and, when requested, makes
   * the nodes contiguous.
   * <p>
   * With {@code paragraphs} set, the first node's start is set to 0 and every
   * following node's start is set to the end of the node before it. Node ends
   * are never modified.
   *
   * @param nodes      list of mutually comparable nodes; when
   *                   {@code paragraphs} is true every element must be a
   *                   {@code PropertyNode}.
   * @param paragraphs whether to rewrite the start of every node after
   *                   sorting.
   */
  public void sortNodes(List nodes, boolean paragraphs)
  {
    Collections.sort(nodes);

    if (!paragraphs)
    {
      return;
    }

    int count = nodes.size();
    int previousEnd = 0;
    int index = 0;
    while (index < count)
    {
      PropertyNode current = (PropertyNode)nodes.get(index);
      current.setStart(previousEnd);
      previousEnd = current.getEnd();
      index++;
    }
  }
}