ca.uhn.fhir.util.StringUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hapi-fhir-base Show documentation
There is a newer version: 7.4.5
/*-
 * #%L
 * HAPI FHIR - Core Library
 * %%
 * Copyright (C) 2014 - 2024 Smile CDR, Inc.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
package ca.uhn.fhir.util;

import jakarta.annotation.Nonnull;

import java.io.CharArrayWriter;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.Arrays;

public class StringUtil {

	/**
	 * If a string ends with a given character, remove that character from the end of the string (as many times as it occurs at the end)
	 */
	public static String chompCharacter(String theInput, char theCharacter) {
		String retVal = theInput;
		while (retVal != null && retVal.length() > 0 && retVal.charAt(retVal.length() - 1) == theCharacter) {
			retVal = retVal.substring(0, retVal.length() - 1);
		}
		return retVal;
	}

	public static String normalizeStringForSearchIndexing(String theString) {
		if (theString == null) {
			return null;
		}

		CharArrayWriter outBuffer = new CharArrayWriter(theString.length());

		/*
		 * The following block of code is used to strip out diacritical marks from latin script
		 * and also convert to upper case. E.g. "j?mes" becomes "JAMES".
		 *
		 * See http://www.unicode.org/charts/PDF/U0300.pdf for the logic
		 * behind stripping 0300-036F
		 *
		 * See #454 for an issue where we were completely stripping non latin characters
		 * See #832 for an issue where we normalize korean characters, which are decomposed
		 */
		String string = Normalizer.normalize(theString, Normalizer.Form.NFD);
		for (int i = 0, n = string.length(); i < n; ++i) {
			char c = string.charAt(i);
			if (c >= '\u0300' && c <= '\u036F') {
				continue;
			} else {
				outBuffer.append(c);
			}
		}

		return new String(outBuffer.toCharArray()).toUpperCase();
	}

	public static String toUtf8String(byte[] theBytes) {
		byte[] bytes = theBytes;
		if (theBytes.length >= 3) {
			if (theBytes[0] == -17 && theBytes[1] == -69 && theBytes[2] == -65) {
				bytes = Arrays.copyOfRange(theBytes, 3, theBytes.length);
			}
		}
		return new String(bytes, StandardCharsets.UTF_8);
	}

	/**
	 * Gets the string prefix of the specified length.
	 *
	 * @param theString
	 * 	String to get the prefix from
	 * @param theCodePointCount
	 * 	Length of the prefix in code points
	 * @return
	 * 	Returns the string prefix of the specified number of codepoints.
	 */
	public static String left(String theString, int theCodePointCount) {
		if (theString == null) {
			return null;
		}

		if (theCodePointCount < 0) {
			return "";
		}

		// char count can only be bigger than the code point count
		if (theString.length() <= theCodePointCount) {
			return theString;
		}

		return theString.substring(0, theString.offsetByCodePoints(0, theCodePointCount));
	}

	@Nonnull
	public static String prependLineNumbers(@Nonnull String theInput) {
		StringBuilder schemaOutput = new StringBuilder();
		int index = 0;
		for (String next : theInput.split("\\n")) {
			schemaOutput
					.append(index++)
					.append(": ")
					.append(next.replace("\r", ""))
					.append("\n");
		}
		return schemaOutput.toString();
	}
}