twitter-the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/tweettext/TextModification.scala

package com.twitter.tweetypie.tweettext

import scala.util.matching.Regex

/**
 * Factory methods that build `TextModification` values describing how a text was changed
 * (regex replacement or insertion), so that entity indices can be re-computed afterwards.
 */
object TextModification {

  /**
   * Lift a text into a TextModification where `original` and `updated` text are the same
   * and `replacements` is empty.
   */
  def identity(text: String): TextModification =
    TextModification(original = text, updated = text, replacements = Nil)

  /**
   * Replace each substring that matches the regex with the substitution string.  Returns a
   * TextModification object that contains the updated text and enough information to also
   * update entity indices, or None if the regex does not match anywhere in `text`.
   *
   * Surrogate pairs are accounted for: the returned TextModification object carries
   * code-point offsets, not code-unit offsets.
   */
  def replaceAll(text: String, regex: Regex, substitution: String): Option[TextModification] =
    replaceAll(text, regex -> substitution)

  /**
   * Replaces substrings that match the given `Regex` with the corresponding substitution
   * string.  Returns a `TextModification` that can be used to reindex entities, or None
   * if none of the regexes match.
   *
   * Matches from all regexes are processed in order of their start offset; the sort is
   * stable, so matches starting at the same offset keep the order of `regexAndSubstitutions`.
   * A match that starts inside the span of an already-applied match is skipped, so
   * overlapping matches from different regexes resolve leftmost-first.
   *
   * @param text the original text
   * @param regexAndSubstitutions pairs of (pattern, literal replacement string)
   */
  def replaceAll(
    text: String,
    regexAndSubstitutions: (Regex, String)*
  ): Option[TextModification] = {
    val matches =
      (for {
        (r, s) <- regexAndSubstitutions
        m <- r.findAllIn(text).matchData
      } yield (m, s)).sortBy { case (m, _) => m.start }

    if (matches.isEmpty) {
      // no match found, return None to indicate no modifications made
      None
    } else {
      val replacements = List.newBuilder[TextReplacement]
      // converts code-unit offsets (as reported by the regex matches) to code-point offsets
      val indexConverter = new IndexConverter(text)
      // contains the retained text, built up as we walk through the regex matches
      val buf = new StringBuilder(text.length)
      // the number of code-points copied into buf
      var codePointsCopied = Offset.CodePoint(0)
      // always holds the start code-unit offset to copy to buf when we encounter
      // either a regex match or end-of-string.
      var anchor = 0

      import indexConverter.toCodePoints

      for ((m, sub) <- matches) {
        // A match starting before `anchor` overlaps text that an earlier match already
        // consumed.  Applying it would call substring(anchor, m.start) with
        // anchor > m.start, which throws, so such matches are skipped.
        if (m.start >= anchor) {
          val unchangedText = text.substring(anchor, m.start)
          val unchangedLen = Offset.CodePoint.length(unchangedText)
          val subLen = Offset.CodePoint.length(sub)

          // copies the text up to the regex match run, plus the replacement string
          buf.append(unchangedText).append(sub)
          codePointsCopied += unchangedLen + subLen

          // the offsets indicate the indices of the matched string in the original
          // text, and the indices of the replacement string in the updated string
          replacements +=
            TextReplacement(
              originalFrom = toCodePoints(Offset.CodeUnit(m.start)),
              originalTo = toCodePoints(Offset.CodeUnit(m.end)),
              updatedFrom = codePointsCopied - subLen,
              updatedTo = codePointsCopied
            )

          anchor = m.end
        }
      }

      // copy whatever follows the last applied match
      buf.append(text.substring(anchor))

      Some(TextModification(text, buf.toString, replacements.result()))
    }
  }

  /**
   * Inserts a string at a specified code point offset.
   * Returns a `TextModification` that can be used to reindex entities.
   *
   * The single resulting replacement has an empty original segment
   * (`originalFrom == originalTo == insertAt`) and an updated segment that spans the
   * inserted text.
   *
   * @param originalText the text to insert into
   * @param insertAt code-point offset into `originalText` at which to insert
   * @param textToInsert the text to insert
   */
  def insertAt(
    originalText: String,
    insertAt: Offset.CodePoint,
    textToInsert: String
  ): TextModification = {
    // String.splitAt works on code units, so convert the code-point offset first
    val insertAtCodeUnit = insertAt.toCodeUnit(originalText).toInt
    val (before, after) = originalText.splitAt(insertAtCodeUnit)
    val updatedText = s"$before$textToInsert$after"
    val textToInsertLength = TweetText.codePointLength(textToInsert)

    TextModification(
      original = originalText,
      updated = updatedText,
      replacements = List(
        TextReplacement.fromCodePoints(
          originalFrom = insertAt.toInt,
          originalTo = insertAt.toInt,
          updatedFrom = insertAt.toInt,
          updatedTo = insertAt.toInt + textToInsertLength
        ))
    )
  }
}

/**
 * Encodes information about insertions/deletions/replacements made to a string, providing
 * the original string, the updated string, and a list of TextReplacement objects
 * that encode the indices of the segments that were changed.  Using this information,
 * it is possible to map an offset into the original string to an offset into the updated
 * string, assuming the text at the offset was not within one of the modified segments.
 *
 * All offsets are code-points, not UTF16 code-units.
 */
case class TextModification(
  original: String,
  updated: String,
  replacements: List[TextReplacement]) {
  // code-point length of the original text, used to range-check indices
  private val originalLen = Offset.CodePoint.length(original)

  /**
   * Translates a code-point offset into `original` to the matching offset into `updated`.
   * Yields None when the offset lies inside a replaced/removed segment, or when it lies
   * past every replacement and is outside `[0, originalLen]`.
   */
  def reindex(index: Offset.CodePoint): Option[Offset.CodePoint] =
    reindex(index, Offset.CodePoint(0), replacements)

  /**
   * Moves an entity of type T to its position in the updated text.  Yields None if either
   * the entity's `fromIndex` or the index just before its `toIndex` cannot be reindexed.
   */
  def reindexEntity[T: TextEntity](e: T): Option[T] =
    reindex(Offset.CodePoint(TextEntity.fromIndex(e))).flatMap { newFrom =>
      // the lookup uses `toIndex - 1` and adds 1 back afterwards, presumably because
      // `toIndex` is exclusive and the last covered code point is what must survive
      reindex(Offset.CodePoint(TextEntity.toIndex(e) - 1)).map { newLast =>
        TextEntity.move(e, newFrom.toShort, (newLast.toShort + 1).toShort)
      }
    }

  /**
   * Reindexes every entity in `es`, dropping those for which `reindexEntity` yields None.
   */
  def reindexEntities[T: TextEntity](es: Seq[T]): Seq[T] =
    es.flatMap(entity => reindexEntity(entity))

  /**
   * The reverse modification: `original` and `updated` trade places and each
   * `TextReplacement` is inverted.
   */
  def inverse: TextModification =
    TextModification(
      original = updated,
      updated = original,
      replacements = replacements.map(_.inverse)
    )

  // Walks the replacement list front to back.  `shift` accumulates the length deltas of
  // all replacements that end at or before `index`; it is added to `index` once a
  // replacement starting after `index` is found or the list is exhausted.
  private def reindex(
    index: Offset.CodePoint,
    shift: Offset.CodePoint,
    reps: List[TextReplacement]
  ): Option[Offset.CodePoint] =
    reps match {
      case next :: _ if index < next.originalFrom =>
        // index precedes the next replacement: only the accumulated shift applies
        Some(index + shift)
      case next :: _ if index < next.originalTo =>
        // index falls inside a replaced segment
        None
      case next :: remaining =>
        // index is at or beyond the end of this replacement: account for its delta
        reindex(index, shift + next.lengthDelta, remaining)
      case Nil =>
        val withinOriginal = index.toInt >= 0 && index <= originalLen
        if (withinOriginal) Some(index + shift) else None
    }
}

object TextReplacement {

  /**
   * Builds a `TextReplacement` from plain Int offsets by wrapping each one in
   * `Offset.CodePoint`.
   */
  def fromCodePoints(
    originalFrom: Int,
    originalTo: Int,
    updatedFrom: Int,
    updatedTo: Int
  ): TextReplacement =
    TextReplacement(
      originalFrom = Offset.CodePoint(originalFrom),
      originalTo = Offset.CodePoint(originalTo),
      updatedFrom = Offset.CodePoint(updatedFrom),
      updatedTo = Offset.CodePoint(updatedTo)
    )
}

/**
 * Encodes the indices of a segment of text in one string that maps to a replacement
 * segment in an updated version of the text.  The replacement segment could be empty
 * (updatedTo == updatedFrom), indicating the segment was removed.
 *
 * All offsets are code-points, not UTF16 code-units.
 *
 * `originalFrom` and `updatedFrom` are inclusive.
 * `originalTo` and `updatedTo` are exclusive.
 */
case class TextReplacement(
  originalFrom: Offset.CodePoint,
  originalTo: Offset.CodePoint,
  updatedFrom: Offset.CodePoint,
  updatedTo: Offset.CodePoint) {

  /** Length of the segment in the original text. */
  def originalLength: Offset.CodePoint = originalTo - originalFrom

  /** Length of the segment in the updated text. */
  def updatedLength: Offset.CodePoint = updatedTo - updatedFrom

  /** Updated length minus original length. */
  def lengthDelta: Offset.CodePoint =
    (updatedTo - updatedFrom) - (originalTo - originalFrom)

  /** Moves only the original-side offsets by `offset`. */
  def shiftOriginal(offset: Offset.CodePoint): TextReplacement =
    TextReplacement(originalFrom + offset, originalTo + offset, updatedFrom, updatedTo)

  /** Moves only the updated-side offsets by `offset`. */
  def shiftUpdated(offset: Offset.CodePoint): TextReplacement =
    TextReplacement(originalFrom, originalTo, updatedFrom + offset, updatedTo + offset)

  /** Moves all four offsets by `offset`. */
  def shift(offset: Offset.CodePoint): TextReplacement =
    shiftOriginal(offset).shiftUpdated(offset)

  /** Swaps the original-side and updated-side offsets. */
  def inverse: TextReplacement =
    copy(
      originalFrom = updatedFrom,
      originalTo = updatedTo,
      updatedFrom = originalFrom,
      updatedTo = originalTo
    )
}