All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.alibaba.dashscope.utils.StringUtils Maven / Gradle / Ivy

The newest version!
package com.alibaba.dashscope.utils;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

public class StringUtils {
  /**
   * Splits {@code src} on every occurrence of {@code spliter}, keeping each occurrence of the
   * splitter as its own element of the result. Used in the tokenizer.
   *
   * <p>Example: src {@code "<|im_start|>system"}, spliter {@code "<|im_start|>"} gives
   * {@code ["<|im_start|>", "system"]}.
   *
   * <p>Empty substrings between, before or after splitters are never emitted, so an empty
   * {@code src} produces an empty list.
   *
   * @param src the text to split; must not be null
   * @param spliter the separator to split on; when null or empty there is nothing to split on and
   *     {@code src} is returned as the single element (or no element if {@code src} is empty)
   * @return a new mutable list with the pieces of {@code src} in their original order, separators
   *     included
   * @throws NullPointerException if {@code src} is null
   */
  public static List<String> splitByString(String src, String spliter) {
    List<String> parts = new ArrayList<>();
    if (spliter == null || spliter.isEmpty()) {
      // An empty splitter matches at every index without consuming any input, so the scan below
      // would never advance; treat it as "no splitter".
      if (!src.isEmpty()) {
        parts.add(src);
      }
      return parts;
    }
    int from = 0;
    int first = src.indexOf(spliter, from);
    while (first != -1) {
      if (first > from) {
        // Plain text sitting between the previous splitter (or the start) and this one.
        parts.add(src.substring(from, first));
      }
      parts.add(spliter);
      from = first + spliter.length();
      first = src.indexOf(spliter, from);
    }
    if (from < src.length()) {
      // Trailing text after the last splitter.
      parts.add(src.substring(from));
    }
    return parts;
  }

  /**
   * Splits {@code text} by each of the given splitters in turn, keeping the splitters themselves
   * in the result.
   *
   * <p>Example: text
   * {@code "<|im_start|>system\nYour are a helpful assistant.<|im_end|>\n<|im_start|>user\nSanFrancisco is a<|im_end|>\n<|im_start|>assistant\n"}
   * with spliters {@code ["<|im_start|>", "<|im_end|>"]} gives
   * {@code ["<|im_start|>", "system\nYour are a helpful assistant.", "<|im_end|>", "\n",
   * "<|im_start|>", "user\nSanFrancisco is a", "<|im_end|>", "\n", "<|im_start|>",
   * "assistant\n"]}.
   *
   * <p>The splitters are applied one after another in iteration order, and every piece produced
   * so far (including splitters emitted by an earlier pass) is split again by each later
   * splitter. A splitter that occurs inside an earlier splitter will therefore break that earlier
   * splitter apart.
   *
   * @param text the text to split; must not be null
   * @param spliters the separators to split on; null or empty entries are ignored
   * @return a new mutable list with the pieces of {@code text} in their original order,
   *     separators included
   * @throws NullPointerException if {@code spliters} is null
   */
  public static List<String> splitByStrings(String text, Collection<String> spliters) {
    List<String> chunks = new ArrayList<>();
    chunks.add(text);
    for (String specialToken : spliters) {
      List<String> thisSplits = new ArrayList<>();
      for (String chunk : chunks) {
        thisSplits.addAll(splitByString(chunk, specialToken));
      }
      chunks = thisSplits;
    }
    return chunks;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy