fr.janalyse.split.StringSplit.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of split_3 Show documentation
split
The newest version!
/*
 * Copyright 2011-2023 David Crosson, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package fr.janalyse.split

import annotation.tailrec

object StringSplit {
  
  /**
   * split a string into a tree of substrings
   *    
   * @param input the string to split
   * @return the tree of substrings
   */
  final def treezer(
      input:String,
      isSeparator:(Char)=>Boolean=defaultIsSeparator,
      isQuoted:(Char)=>Option[Char]=defaultIsQuoted,
      isSubGroup:(Char)=>Option[Char]=defaultIsSubGroup,
      concatCheck:(String)=>Boolean=defaultConcatCheck
      ):SplitItem = {
    val it = input.trim.iterator
    
    //@tailrec
    def worker(buf:String, accumulator:List[SplitItem], subGroupEndMarker:Option[Char]):SplitItem = {
      if (it.hasNext) {
        val ch = it.next()
        if (subGroupEndMarker.isDefined && subGroupEndMarker.get == ch) {
          buf match {
            case "" => SplitNode(accumulator)
            case _  => SplitNode(accumulator:+SplitWord(buf))
          }
        } else if (isSeparator(ch) && concatCheck(buf)) {
          worker(buf+ch, accumulator,subGroupEndMarker)
        } else if (isSeparator(ch)) {
          buf match {
            case "" => worker("", accumulator,subGroupEndMarker)
            case _ => worker("", accumulator:+SplitWord(buf),subGroupEndMarker)
          }
        } else {
          isSubGroup(ch) match {
            case Some(endMarker)=>
              val childnode = worker("", Nil, Option(endMarker))
              buf match {
                case "" => worker("", accumulator:+childnode, subGroupEndMarker)
                case _ => worker("", accumulator:+SplitWord(buf):+childnode, subGroupEndMarker)
              }
            case None =>
              // TODO Add quoted block processing
              worker(buf+ch, accumulator,subGroupEndMarker)
          }
        }
      } else {
        buf match {
          case "" => SplitNode(accumulator)
          case _  => SplitNode(accumulator:+SplitWord(buf))
        }
      }
    }
    
    worker("",Nil,None)
  }
  
  sealed trait SplitItem {
    def hasChild:Boolean
    def hasContent:Boolean
  }
  
  case class SplitWord(content:String) extends SplitItem {
    def hasChild=false
    def hasContent=true
  }
  
  case class SplitNode(children:List[SplitItem]) extends SplitItem {
    def hasChild = !children.isEmpty
    def hasContent=false
  }
  object SplitNode {
    def apply():SplitNode = SplitNode(Nil)
    def apply(child:SplitItem):SplitNode = SplitNode(child::Nil)
    def apply(children:SplitItem*):SplitNode = SplitNode(children.toList)
  }

  def defaultIsQuoted(ch:Char):Option[Char]=
    ch match {
      case '"' | '\'' => Some(ch)
      case _ => None
    }
  
  def defaultIsSeparator(ch:Char):Boolean =
    ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t'
  
  
  def defaultConcatCheck(current:String):Boolean = 
    current.trim.endsWith(",")
  
  
  def defaultIsSubGroup(ch:Char):Option[Char] = {
    ch match {
      case '[' => Some(']')
      case '(' => Some(')')
      case '{' => Some('}')
      case _ => None
    }
  }
  
  
  
  
  /**
   * Smart space oriented string split that takes into account comma, quote, double quotes and brackets.
   *
   * @param line the line to split into a vector a substrings
   * @param maxcount max size for the results vector 
   * @return substrings
   */
  @deprecated("Use split method instead of tokenizer", "0.3.2")
  final def tokenizer(line: String,maxcount:Int=0): Vector[String] = tokenize(line.trim, maxcount=maxcount)

  /**
   * Smart space oriented string split that takes into account comma, quote, double quotes and brackets.
   *
   * @param line the line to split into a vector a substrings
   * @param maxcount max size for the results vector 
   * @return vector of substrings
   */
  final def split(line: String, maxcount:Int=0): Vector[String] = tokenize(line.trim, maxcount=maxcount)

  
  
  @tailrec
  private final def indexWhere(str: String, look4char:Char, pos: Int = 0): Int = {
    val sz = str.size - pos
    if (sz == 0) -1
    else if (str(pos)==look4char && (sz==1 || spacecheck(str(pos+1)))) pos
    else if (str.head == '\\' && sz > 1) indexWhere(str, look4char, pos + 2)
    else indexWhere(str, look4char, pos + 1)
  }

  private final def partition(str: String, pos:Int, look4char:Char): Tuple2[String, Int] = {
    indexWhere(str, look4char, pos=pos) match {
      case -1 => (str.substring(pos), str.size)
      case i  => (str.substring(pos, i), i + 1)
    }
  }

  private final def charCounterPart(ch:Char):Char = {
    (ch match {
      case '"'  => '"'
      case '\'' => '\''
      case '['  => ']'
      case '('  => ')'
      case '{'  => '}'  
    })
  }
  
//  private final def groupedCheck(ch: Char)(testWith: String, pos:Int): Boolean = {
//    (ch match {
//      case '"'  => '"'
//      case '\'' => '\''
//      case '['  => ']'
//      case '('  => ')'
//      case '{'  => '}'  
//    }) == testWith(pos) && {
//      testWith.size - pos == 1 || spacecheck(testWith(pos+1))
//    }
//  }
  
  private final def spacecheck(curchar:Char):Boolean = {
    curchar == ' ' || curchar == '\r' || curchar == '\t'
  }

  
  @tailrec
  private final def partitionSpaceWithoutComma(str: String, pos:Int = 0, relative: Int = 0, commafound: Boolean = false, spacefound: Boolean = false): Tuple2[String, Int] = {
    if (str.size == pos+relative) (str.substring(pos), pos+relative)
    else {
      val curchar = str.charAt(pos+relative)
      if (curchar == ',' || curchar == ';') {
        partitionSpaceWithoutComma(str, pos, relative + 1, true, spacefound)
      } else if (spacecheck(curchar)) {
        partitionSpaceWithoutComma(str, pos, relative + 1, commafound, true)
      } else {
        if (spacefound && !commafound) {
          (str.substring(pos, pos+relative).trim, pos+relative)
        } else {
          partitionSpaceWithoutComma(str, pos, relative + 1, false, false)
        }
      }
    }
  }
  
  @tailrec
  private final def tokenize(me: String, maxcount:Int, pos:Int=0, accumulator: Vector[String] = Vector.empty): Vector[String] = {
    if (pos >= me.size) accumulator
    else if (maxcount>0 && accumulator.size + 1 == maxcount) accumulator :+ me.substring(pos).trim()
    else {      
      val ch = me(pos)
      if (ch == ' ' || ch == '\r' | ch == '\t') tokenize(me, maxcount, pos+1, accumulator)
      else if ( ch == '"' || ch== '\'' || ch == '[' || ch== '(' || ch == '{') {
          val (selected, newpos) = partition(me, pos+1, charCounterPart(ch))
          tokenize(me, maxcount, newpos, accumulator :+ selected)
      } else {
          val (selected, newpos) = partitionSpaceWithoutComma(me, pos)
          tokenize(me, maxcount, newpos, accumulator :+ selected)
      }

    }
  }  
}