All Downloads are FREE. Search and download functionalities are using the official Maven repository.

scales.aalto.parser.pull.AsyncParser.scala Maven / Gradle / Ivy

The newest version!
package scales.aalto.parser.pull

import scales.xml._
import scales.utils._
import io._
import resources._
import com.fasterxml.aalto._
import AsyncXMLStreamReader.EVENT_INCOMPLETE
import javax.xml.stream.XMLStreamConstants.END_DOCUMENT

import java.nio.ByteBuffer
import java.nio.channels.ReadableByteChannel

import scales.xml.parser.pull.PullUtils

import scalaz.EphemeralStream
import EphemeralStream.emptyEphemeralStream

import scales.xml.parser.strategies.{MemoryOptimisationStrategy, OptimisationToken}

/**
 * Carbon copy of IterV Input, it's really quite useful, thanks!
 *
 * An Input is consumed by supplying one handler per possible state; the
 * concrete instance (see Empty, El and EOF) picks which handler is used.
 */
sealed trait Input[E] {
  /**
   * Folds over this input.
   *
   * @param empty result to use when this input is the Empty case (by-name)
   * @param el function given the (by-name) element when this input is the El case
   * @param eof result to use when this input is the EOF case (by-name)
   * @return the result of whichever handler matches this input
   */
  def apply[Z](empty: => Z, el: (=> E) => Z, eof: => Z): Z
}

/**
 * Input that has no value available right now: folding over it always
 * selects the `empty` handler.
 */
object Empty {
  /** Creates an Input whose fold yields the `empty` handler's result. */
  def apply[E] : Input[E] = new Input[E] {
    def apply[Z](empty: => Z, el: (=> E) => Z, eof: => Z): Z = empty
  }

  /**
   * Extractor: true only when `r` is the Empty case.
   * The element of an El is never evaluated by this check.
   */
  def unapply[E](r: Input[E]): Boolean =
    r.apply[Boolean](
      empty = true,
      el = _ => false,
      eof = false)
}

/**
 * Input that has a value available: folding over it always selects the
 * `el` handler, passing the (lazily evaluated) element.
 */
object El {
  /**
   * Creates an Input carrying `e0`.  `e0` is by-name and is only evaluated
   * if and when the `el` handler chooses to evaluate it.
   */
  def apply[E](e0: => E): Input[E] = new Input[E] {
    def apply[Z](empty: => Z, el: (=> E) => Z, eof: => Z): Z = el(e0)
  }

  /**
   * Extractor: Some(element) when `r` is the El case (this evaluates the
   * element), None for Empty and EOF.
   */
  def unapply[E](r: Input[E]): Option[E] =
    r.apply[Option[E]](
      empty = None,
      el = e => Some(e),
      eof = None)
}

/**
 * Input that is exhausted: folding over it always selects the `eof` handler.
 */
object EOF {
  /** Creates an Input whose fold yields the `eof` handler's result. */
  def apply[E] : Input[E] = new Input[E] {
    def apply[Z](empty: => Z, el: (=> E) => Z, eof: => Z): Z = eof
  }

  /**
   * Extractor: true only when `r` is the EOF case.
   * The element of an El is never evaluated by this check.
   */
  def unapply[E](r: Input[E]): Boolean =
    r.apply[Boolean](
      empty = false,
      el = _ => false,
      eof = true)
}


/**
 * An AsyncParser: a DataChunk is fed in via nextInput which, in turn, returns an
 * Input[EphemeralStream[PullType]] of events.
 * When the Input is El then the stream may be evaluated to get all available events,
 * and will return empty when no more events are available for that data chunk.
 *
 * Misc items found before the document element are collected into `prolog`, those
 * found after it into `end`; neither is emitted as an event.
 *
 * See nextInput for more info.
 */
abstract class AsyncParser(implicit xmlVersion : XmlVersion) extends CloseOnNeed with DocLike {

  protected type Token <: OptimisationToken

  protected val strategy : MemoryOptimisationStrategy[Token]

  protected val feeder : AsyncInputFeeder

  protected val token : Token

  protected val parser : AsyncXMLStreamReader

  /**
   * Closes the feeder and parser
   */
  protected def doClose : Unit = {
    feeder.endOfInput
    parser.close
  }

  // to capture the miscs.
  private[this] var docImpl = EmptyDoc()

  /** Replaces the captured prolog with `p`. */
  protected def copyProlog(p : Prolog) : Unit = {
    docImpl = docImpl.copy( prolog = p )
  }

  /** Appends the misc item carried by `m` to the prolog. */
  protected def addPrologMisc(m : PullType) : Unit = {
    copyProlog(
      prolog.copy(
        misc = prolog.misc :+ PullUtils.getMisc(m, "prolog")
      )
    )
  }

  /** Appends the misc item carried by `m` to the end-of-document miscs. */
  protected def addEndMisc(m : PullType) : Unit = {
    docImpl = docImpl.copy(
      end = end.copy(
        misc = end.misc :+ PullUtils.getMisc(m, "endMisc")
      )
    )
  }

  def prolog = docImpl.prolog
  def end = docImpl.end

  // depth == -1 means we are outside the document element (prolog or end misc)
  protected var depth = -1
  protected var started = false

  /**
   * The document element has been reached
   */
  def startedElementProcessing : Boolean = started

  // Placeholder event handed back by eventHandler; callers inspect the event number instead.
  protected val incompOrEnd : PullType = Left(Text("I am incomplete or doc end"))

  // Passed to PullUtils.pumpEvent for event types it does not handle itself:
  // incomplete input and document end are expected here, anything else is fatal.
  protected val eventHandler = (x : Int) => {
    if (x == EVENT_INCOMPLETE || x == END_DOCUMENT)
      incompOrEnd
    else
      error("Got an unexpected event type " + x +" cannot proceed.")
  }

  /**
   * Pushes through Misc items in either prolog or the epilog.
   *
   * Misc items (and the DTD marker) are recorded on the document and reported as
   * Empty; El is only returned for the first real event of the document element.
   * EOF is returned, and the parser closed, on END_DOCUMENT.
   */
  protected def pumpMisc() : Input[PullType] = {

    val (event, num, odepth, oprolog) =
      PullUtils.pumpEvent(parser, strategy, token, prolog, depth)(eventHandler)

    depth = odepth

    if (oprolog != prolog) {
      // doc start
      copyProlog( oprolog )
    }

    if (num == END_DOCUMENT) {
      // EOF - let the iter deal
      closeResource
      EOF[PullType]
    } else if (num == EVENT_INCOMPLETE) {
      Empty[PullType]
    } else if (odepth == -1) {
      // still misc: record it, nothing to emit
      event match {
        case Left(ev) if ev eq PullUtils.dtdDummy =>
          copyProlog( prolog.copy(dtd = Some(
            DTD("", "", "") // DTD has funnyness TODO find out what it looks like
          )))
        case _ =>
          if (!started)
            addPrologMisc(event)
          else
            addEndMisc(event)
      }

      Empty[PullType]
    } else {
      started = true
      // pump actual first event, next depth -1 is endmisc
      El(event)
    }
  }

  /**
   * Pulls the next event from the parser.
   *
   * Outside of the document element this keeps consuming misc items for as long as
   * the feeder still has buffered input; inside it returns exactly one event.
   */
  protected def pump() : Input[PullType] = {
    // don't have to re-read, let it push what it has
    if (depth == -1) {
      var r = pumpMisc()
      // keep going if there are more events to process and are still in misc
      while (!feeder.needMoreInput && r(empty = true, el = _ => false, eof = false)) {
        r = pumpMisc()
      }
      r
    } else {
      // 2nd > events
      val (event, num, odepth, oprolog) =
        PullUtils.pumpEvent(parser, strategy, token, prolog, depth)(eventHandler)

      depth = odepth

      if (num == END_DOCUMENT) {
        // EOF - let the iter deal -- should not occur here though, only when depth == -1
        closeResource
        EOF[PullType]
      } else if (num == EVENT_INCOMPLETE) {
        // let the iter attempt to deal
        Empty[PullType]
      } else {
        // actual data present, odepth -1 is looked at for the next iter
        El(event)
      }
    }
  }

  /**
   * Lazily produces the events available from the data fed so far.
   * The stream ends when the parser is closed, needs more input, or pump
   * reports Empty/EOF (an EOF is surfaced by the next call to nextInput).
   */
  protected def nextStream(): EphemeralStream[PullType] = {
    if (isClosed || feeder.needMoreInput) {
      emptyEphemeralStream
    } else {
      // push one more off
      val pumped = pump()
      pumped(
        el = e => EphemeralStream.cons[PullType](e, nextStream()),
        empty = emptyEphemeralStream,
        eof = emptyEphemeralStream // next run will pick it up
      )
    }
  }

  /**
   * Given a DataChunk will return the next available data stream.
   * Returns EOF when the Parser is closed (whatever chunk is supplied) or the chunk is EOF,
   * Empty when more DataChunks are needed.
   * Returning El[EphemeralStream[PullType]] provides a stream which will lazily evaluate all available Xml Events from this DataChunk.
   *
   *
   * NOTEs (see Aalto-xml feeder.feedInput for details on this):
   * 1) All existing events from the last El should be consumed before calling nextInput again
   * 2) The DataChunk passed in should not be re-used until the stream returns empty (or Empty/EOF is returned).
   */
  def nextInput(d: DataChunk): Input[EphemeralStream[PullType]] = {
    // An empty chunk only means "nothing yet" while the parser is still open;
    // once closed every call must report EOF so consumers can terminate.
    if (d.isEmpty && !isClosed) {
      Empty[EphemeralStream[PullType]]
    } else if (isClosed || d.isEOF) {
      if (d.isEOF) {
        closeResource
      }
      EOF[EphemeralStream[PullType]]
    } else {
      if (!feeder.needMoreInput) {
        error("The stream from the previous call to nextInput was not evaluated ")
      }

      feeder.feedInput(d.array, d.offset, d.length)

      val res = nextStream()

      if (res.isEmpty) { // let the enumerator deal with it
        Empty[EphemeralStream[PullType]]
      } else // it may have empty after the call
        El[EphemeralStream[PullType]](res)
    }
  }

}

object AsyncParser {

  /**
   * Creates a parser backed by an AsyncXMLInputFactory taken from `parsers`.
   * The factory is handed back to the pool when the parser is closed.
   *
   * @param optimisationStrategy strategy used to create the token passed to the pull machinery
   * @param parsers pool the AsyncXMLInputFactory is grabbed from and given back to
   * @param xmlVersion the xml version forwarded to AsyncParser
   */
  def apply[TokenT <: OptimisationToken](
      optimisationStrategy : MemoryOptimisationStrategy[TokenT] = defaultOptimisation,
      parsers : Pool[AsyncXMLInputFactory] = AsyncXMLInputFactoryPool
    )( implicit xmlVersion : XmlVersion ) : AsyncParser = new AsyncParser(){
    type Token = TokenT
    val strategy = optimisationStrategy

    import PullUtils.weAreInAParser
    val token = strategy.createToken

    val pf = parsers.grab

    val parser = pf.createAsyncXMLStreamReader()
    val feeder = parser.getInputFeeder

    /**
     * also handle the parsers pool: the factory is always returned,
     * even when closing the feeder/parser fails.
     */
    override protected def doClose : Unit = {
      try super.doClose
      finally parsers.giveBack(pf)
    }

  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy