typelevel · isomarcte · Feb 5, 2022 · Feb 6, 2022 · Feb 6, 2022 · Feb 6, 2022
diff --git a/bench/src/main/scala/com/rossabaker/ci/bench/CaseFoldedStringBench.scala b/bench/src/main/scala/com/rossabaker/ci/bench/CaseFoldedStringBench.scala
@@ -0,0 +1,44 @@
+package org.typelevel.ci
+package bench
+
+import org.scalacheck._
+import org.typelevel.ci.testing.arbitraries._
+import cats._
+import org.openjdk.jmh.annotations._
+import java.util.concurrent.TimeUnit
+
+@State(Scope.Thread)
+@BenchmarkMode(Array(Mode.Throughput, Mode.AverageTime))
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+class CaseFoldedStringBench {
+  // Seed used for the next generated input; starts at Long.MinValue and only counts upwards.
+  var currentSeed: Long = Long.MinValue
+  // Returns the current seed and advances `currentSeed` by one (side effect).
+  def nextSeed: Long = {
+    val seed = currentSeed
+    currentSeed += 1L
+    seed
+  }
+  // Generates one arbitrary String from the next seed; throws if the generator yields nothing.
+  def nextString: String =
+    Arbitrary.arbitrary[String].apply(Gen.Parameters.default, rng.Seed(nextSeed)).getOrElse(throw new AssertionError("Failed to generate String."))
+  // Generates an arbitrary List[String] from the next seed; throws if the generator yields nothing.
+  def nextListOfString: List[String] =
+    Gen.listOf(Arbitrary.arbitrary[String])(Gen.Parameters.default, rng.Seed(nextSeed)).getOrElse(throw new AssertionError("Failed to generate List[String]."))
+  // NOTE: inputs are generated inside every benchmark below, so each score includes ScalaCheck generation cost.
+  @Benchmark
+  def caseFoldedStringHash: Int =
+    CaseFoldedString(nextString).hashCode
+
+  @Benchmark
+  def caseFoldedStringFoldMap: CaseFoldedString =
+    Foldable[List].foldMap(nextListOfString)(CaseFoldedString.apply)
+  // Plain `String` counterparts of the two benchmarks above, for comparison.
+  @Benchmark
+  def stringHash: Int =
+    nextString.hashCode
+
+  @Benchmark
+  def stringFoldMap: String =
+    Foldable[List].foldMap(nextListOfString)(identity)
+}
diff --git a/build.sbt b/build.sbt
@@ -71,9 +71,12 @@ lazy val bench = project
   .enablePlugins(NoPublishPlugin)
   .enablePlugins(JmhPlugin)
   .settings(
-    name := "case-insensitive-bench"
+    name := "case-insensitive-bench",
+    libraryDependencies ++= List(
+      "org.scalacheck" %% "scalacheck" % scalacheckV
+    )
   )
-  .dependsOn(core.jvm)
+  .dependsOn(core.jvm, testing.jvm)
 
 lazy val docs = project
   .in(file("site"))

diff --git a/core/src/main/scala/org/typelevel/ci/CIString.scala b/core/src/main/scala/org/typelevel/ci/CIString.scala
@@ -24,52 +24,46 @@ import scala.math.Ordered
 
 /** A case-insensitive String.
   *
-  * Two CI strings are equal if and only if they are the same length, and each corresponding
-  * character is equal after calling either `toUpper` or `toLower`.
+  * Comparisons are based on the case folded representation of the `String`
+  * as defined by the Unicode standard. See [[CaseFoldedString]] for a full
+  * discussion on those rules.
   *
-  * Ordering is based on a string comparison after folding each character to uppercase and then back
-  * to lowercase.
-  *
-  * All comparisons are insensitive to locales.
+  * @note This class differs from [[CaseFoldedString]] in that it keeps a
+  *       reference to the original input `String` in whatever form it was
+  *       given. This makes [[CIString]] useful if you wish to perform case
+  *       insensitive operations on a `String`, but then recover the original,
+  *       unaltered form. If you do not care about the original input form,
+  *       and just want a single case insensitive `String` value, then
+  *       [[CaseFoldedString]] is more efficient and you should consider using
+  *       that directly.
   *
   * @param toString
   *   The original value the CI String was constructed with.
   */
-final class CIString private (override val toString: String)
+final class CIString private (override val toString: String, val asCaseFoldedString: CaseFoldedString)
     extends Ordered[CIString]
     with Serializable {
+
+  // Legacy single-argument constructor: derives the folded form itself, using the default folding rules.
+  @deprecated(message = "Please provide a CaseFoldedString directly.", since = "1.3.0")
+  private def this(toString: String) =
+    this(toString, CaseFoldedString(toString))
+
   override def equals(that: Any): Boolean =
     that match {
       case that: CIString =>
-        this.toString.equalsIgnoreCase(that.toString)
+        // Compare the case folded forms. java.lang.String.equalsIgnoreCase
+        // _does not_ handle all title case Unicode characters, so it cannot
+        // be used here. See the tests for an example.
+        this.asCaseFoldedString == that.asCaseFoldedString
       case _ => false
     }
 
-  @transient private[this] var hash = 0
-  override def hashCode(): Int = {
-    if (hash == 0)
-      hash = calculateHash
-    hash
-  }
-
-  private[this] def calculateHash: Int = {
-    var h = 17
-    var i = 0
-    val len = toString.length
-    while (i < len) {
-      // Strings are equal igoring case if either their uppercase or lowercase
-      // forms are equal. Equality of one does not imply the other, so we need
-      // to go in both directions. A character is not guaranteed to make this
-      // round trip, but it doesn't matter as long as all equal characters
-      // hash the same.
-      h = h * 31 + toString.charAt(i).toUpper.toLower
-      i += 1
-    }
-    h
-  }
+  override def hashCode(): Int =
+    asCaseFoldedString.hashCode // hash the folded form, the same value `equals` compares
 
   override def compare(that: CIString): Int =
-    this.toString.compareToIgnoreCase(that.toString)
+    Ordering[CaseFoldedString].compare(asCaseFoldedString, that.asCaseFoldedString) // order by the folded forms
 
   def transform(f: String => String): CIString = CIString(f(toString))
 
@@ -87,7 +81,15 @@ final class CIString private (override val toString: String)
 
 @suppressUnusedImportWarningForCompat
 object CIString {
-  def apply(value: String): CIString = new CIString(value)
+
+  def apply(value: String, useTurkicFolding: Boolean): CIString =
+    new CIString(value, CaseFoldedString(value, turkicFoldingRules = useTurkicFolding)) // keep the input, fold a copy
+
+  def apply(value: String): CIString =
+    apply(value, useTurkicFolding = false) // default: no Turkic folding rules
+
+  def fromCaseFoldedString(value: CaseFoldedString): CIString =
+    new CIString(value.toString, value) // the folded text also serves as the "original" string
 
   val empty = CIString("")
 

diff --git a/core/src/main/scala/org/typelevel/ci/CaseFoldedString.scala b/core/src/main/scala/org/typelevel/ci/CaseFoldedString.scala
@@ -0,0 +1,162 @@
+package org.typelevel.ci
+
+import cats._
+import cats.kernel.LowerBounded
+import org.typelevel.ci.compat._
+import scala.annotation.tailrec
+
+/** A case folded `String`. This is a `String` which has been converted into a
+  * state which is suitable for case insensitive matching under the Unicode
+  * standard.
+  *
+  * This type differs from [[CIString]] in that it does ''not'' retain the
+  * original input `String` value. That is, this is a destructive
+  * transformation. You should use [[CaseFoldedString]] instead of
+  * [[CIString]] when you only want the case insensitive `String` and you
+  * never need to recover the original input value. In such cases
+  * [[CaseFoldedString]] will be more efficient than [[CIString]] as it only
+  * has to keep around a single `String` in memory.
+  *
+  * Case insensitive `String` values under Unicode are not always intuitive,
+  * especially on the JVM. There are three character cases to consider, lower
+  * case, upper case, and title case, and not all Unicode codePoints have all
+  * 3, some only have 2, some only 1. For some codePoints, the JRE standard
+  * operations don't always work as you'd expect.
+  *
+  * {{{
+  * scala> val codePoint: Int = 8093
+  * val codePoint: Int = 8093
+  *
+  * scala> new String(Character.toChars(codePoint))
+  * val res0: String = ᾝ
+  *
+  * scala> res0.toUpperCase
+  * val res1: String = ἭΙ
+  *
+  * scala> res0.toUpperCase.toLowerCase == res0.toLowerCase
+  * val res2: Boolean = false
+  *
+  * scala> Character.getName(res0.head)
+  * val res3: String = GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+  *
+  * scala> res0.toUpperCase.toLowerCase.equalsIgnoreCase(res0.toLowerCase)
+  * val res4: Boolean = false
+  * }}}
+  *
+  * In this example, given the Unicode character \u1f9d, converting it to
+  * upper case and then to lower case does not yield the same `String` as
+  * converting it directly to lower case. `String.equalsIgnoreCase` also does
+  * not treat the two as equal, contrary to the Unicode standard.
+  *
+  * Making matters more complicated, for certain Turkic languages, the case
+  * folding rules change. See the Unicode standard for a full discussion of
+  * the topic.
+  *
+  * @note For most `String` values the `toString` form of this is lower case
+  *       (when the given character has more than one case), but this is not
+  *       always the case. Certain Unicode scripts have exceptions to this and
+  *       will be case folded into upper case. If you want/need an only lower
+  *       case `String`, you should call `.toString.toLowerCase`.
+  *
+  * @see [[https://www.unicode.org/versions/Unicode14.0.0/ch05.pdf#G21790]]
+  */
+final case class CaseFoldedString private (override val toString: String) extends AnyVal {
+  /** `true` when the folded text contains no characters. */
+  def isEmpty: Boolean = toString.isEmpty
+  /** `true` when the folded text contains at least one character. */
+  def nonEmpty: Boolean = !isEmpty
+  /** Length of the folded text in UTF-16 `Char`s, as reported by `String.length`. */
+  def length: Int = toString.length
+  /** Alias for `length`. */
+  def size: Int = length
+  /** Trims with `String.trim`; trimming only drops characters, so the result is wrapped without folding it again. */
+  def trim: CaseFoldedString =
+    new CaseFoldedString(toString.trim)
+  // Private and routed through `apply`, so the argument is always folded; presumably hides the synthesized `copy`.
+  private final def copy(toString: String): CaseFoldedString =
+    CaseFoldedString(toString)
+}
+
+object CaseFoldedString {
+
+  /** Create a [[CaseFoldedString]] by case folding every code point of `value`.
+    *
+    * @param turkicFoldingRules if `true`, use the case folding rules
+    *                           applicable to some Turkic languages.
+    */
+  def apply(value: String, turkicFoldingRules: Boolean): CaseFoldedString = {
+    // Size the builder to the input and let it grow for the rare expanding folds:
+    // `value.length * 3` overflows `Int` (negative capacity) for very long inputs.
+    val builder: java.lang.StringBuilder = new java.lang.StringBuilder(value.length)
+    val foldCodePoint: Int => Array[Int] =
+      if (turkicFoldingRules) CaseFolds.turkicFullCaseFoldedCodePoints
+      else CaseFolds.fullCaseFoldedCodePoints
+
+    // Walk the input one code point at a time, appending each folded expansion.
+    @tailrec
+    def loop(index: Int): String =
+      if (index >= value.length) {
+        builder.toString
+      } else {
+        val codePoint: Int = value.codePointAt(index)
+        foldCodePoint(codePoint).foreach(c => builder.appendCodePoint(c))
+        // Advance by the number of `Char`s this code point occupies (2 for supplementary, else 1).
+        loop(index + Character.charCount(codePoint))
+      }
+
+    new CaseFoldedString(loop(0))
+  }
+
+  /** Create a [[CaseFoldedString]] from a `String`.
+    *
+    * @note This factory method does ''not'' use the Turkic case folding
+    *       rules. For the majority of languages this is the correct method of
+    *       case folding. If you know your `String` is specific to one of the
+    *       Turkic languages which use special case folding rules, you can use
+    *       the secondary factory method to enable case folding under those
+    *       rules.
+    */
+  def apply(value: String): CaseFoldedString =
+    apply(value, turkicFoldingRules = false)
+
+  val empty: CaseFoldedString =
+    CaseFoldedString("")
+  /** Hashes with `hashCode` and orders by comparing the folded `String` values. */
+  implicit val hashAndOrderForCaseFoldedString: Hash[CaseFoldedString] with Order[CaseFoldedString] =
+    new Hash[CaseFoldedString] with Order[CaseFoldedString] {
+      override def hash(x: CaseFoldedString): Int =
+        x.hashCode
+
+      override def compare(x: CaseFoldedString, y: CaseFoldedString): Int =
+        x.toString.compare(y.toString)
+    }
+  /** `Ordering` derived from [[hashAndOrderForCaseFoldedString]]. */
+  implicit val orderingForCaseFoldedString: Ordering[CaseFoldedString] =
+    hashAndOrderForCaseFoldedString.toOrdering
+  /** Shows a value via its `toString`, i.e. the folded text. */
+  implicit val showForCaseFoldedString: Show[CaseFoldedString] =
+    Show.fromToString
+  /** Lower bound: [[empty]] under the order of [[hashAndOrderForCaseFoldedString]]. */
+  implicit val lowerBoundForCaseFoldedString: LowerBounded[CaseFoldedString] =
+    new LowerBounded[CaseFoldedString] {
+      override val partialOrder: PartialOrder[CaseFoldedString] =
+        hashAndOrderForCaseFoldedString
+
+      override val minBound: CaseFoldedString =
+        empty
+    }
+  /** Monoid by concatenating the folded text, with [[empty]] as the identity. */
+  implicit val monoidForCaseFoldedString: Monoid[CaseFoldedString] =
+    new Monoid[CaseFoldedString] {
+      override val empty: CaseFoldedString = CaseFoldedString.empty
+
+      override def combine(x: CaseFoldedString, y: CaseFoldedString): CaseFoldedString =
+        new CaseFoldedString(x.toString + y.toString)
+
+      override def combineAll(xs: IterableOnce[CaseFoldedString]): CaseFoldedString = {
+        val sb: StringBuilder = new StringBuilder
+        xs.iterator.foreach(cfs => sb.append(cfs.toString))
+        new CaseFoldedString(sb.toString)
+      }
+    }
+}