kopia lustrzana https://github.com/twitter/the-algorithm
233 wiersze
8.0 KiB
Scala
233 wiersze
8.0 KiB
Scala
package com.twitter.tweetypie.tweettext
|
|
|
|
import scala.util.matching.Regex
|
|
|
|
object TextModification {

  /**
   * Lift a text into a TextModification where `original` and `updated` text are the same
   * and `replacements` is empty.
   */
  def identity(text: String): TextModification =
    TextModification(original = text, updated = text, replacements = Nil)

  /**
   * Replace each substring that matches the regex with the substitution string, returns a
   * TextModification object that contains the updated text and enough information to also
   * update entity indices.
   *
   * This method should correctly be taking into account surrogate-pairs. The returned
   * TextModification object has code-point offsets, instead of code-unit offsets.
   *
   * @param text the text to search
   * @param regex the pattern whose matches are replaced
   * @param substitution the string appended verbatim in place of each match
   * @return `None` if the regex does not match anywhere in `text`
   */
  def replaceAll(text: String, regex: Regex, substitution: String): Option[TextModification] =
    replaceAll(text, regex -> substitution)

  /**
   * Replaces substrings that match the given `Regex` with the corresponding substitution
   * string. Returns a `TextModification` that can be used to reindex entities.
   *
   * Matches of all regexes are collected and processed in order of their start offset.
   * A match that starts inside a region already consumed by an earlier match is skipped,
   * so the leftmost match wins; for matches starting at the same offset the one belonging
   * to the earlier `(Regex, String)` pair wins, because the sort is stable.
   *
   * @param text the text to search
   * @param regexAndSubstitutions pairs of pattern and the string that replaces its matches;
   *                              the substitution is appended verbatim
   * @return `None` if none of the regexes match, otherwise the modification
   */
  def replaceAll(
    text: String,
    regexAndSubstitutions: (Regex, String)*
  ): Option[TextModification] = {
    val matches =
      (for {
        (r, s) <- regexAndSubstitutions
        m <- r.findAllIn(text).matchData
      } yield (m, s)).sortBy { case (m, _) => m.start }

    if (matches.isEmpty) {
      // no match found, return None to indicate no modifications made
      None
    } else {
      val replacements = List.newBuilder[TextReplacement]
      val indexConverter = new IndexConverter(text)
      // contains the retained text, built up as we walk through the regex matches
      val buf = new StringBuilder(text.length)
      // the number of code-points copied into buf
      var codePointsCopied = Offset.CodePoint(0)
      // always holds the start code-unit offset to copy to buf when we encounter
      // either a regex match or end-of-string.
      var anchor = 0

      import indexConverter.toCodePoints

      for ((m, sub) <- matches) {
        // Matches coming from different regexes may overlap. A match that begins before
        // `anchor` lies (at least partly) inside text that was already replaced; applying
        // it would make `substring(anchor, m.start)` throw, so it is skipped instead.
        if (m.start >= anchor) {
          val unchangedText = text.substring(anchor, m.start)
          val unchangedLen = Offset.CodePoint.length(unchangedText)
          val subLen = Offset.CodePoint.length(sub)

          // copies the text up to the regex match run, plus the replacement string
          buf.append(unchangedText).append(sub)
          codePointsCopied += unchangedLen + subLen

          // the offsets indicate the indices of the matched string in the original
          // text, and the indices of the replacement string in the updated string
          replacements +=
            TextReplacement(
              originalFrom = toCodePoints(Offset.CodeUnit(m.start)),
              originalTo = toCodePoints(Offset.CodeUnit(m.end)),
              updatedFrom = codePointsCopied - subLen,
              updatedTo = codePointsCopied
            )

          anchor = m.end
        }
      }

      // copy whatever follows the last applied match
      buf.append(text.substring(anchor))

      Some(TextModification(text, buf.toString, replacements.result()))
    }
  }

  /**
   * Inserts a string at a specified code point offset.
   * Returns a `TextModification` that can be used to reindex entities.
   *
   * @param originalText the text to insert into
   * @param insertAt code-point offset in `originalText` at which the insertion happens
   * @param textToInsert the string to insert
   * @return a modification with a single replacement whose original segment is empty
   */
  def insertAt(
    originalText: String,
    insertAt: Offset.CodePoint,
    textToInsert: String
  ): TextModification = {
    // String.splitAt works on code units, so convert the code-point offset first
    val insertAtCodeUnit = insertAt.toCodeUnit(originalText).toInt
    val (before, after) = originalText.splitAt(insertAtCodeUnit)
    val updatedText = s"$before$textToInsert$after"
    val textToInsertLength = TweetText.codePointLength(textToInsert)

    TextModification(
      original = originalText,
      updated = updatedText,
      replacements = List(
        TextReplacement.fromCodePoints(
          originalFrom = insertAt.toInt,
          originalTo = insertAt.toInt,
          updatedFrom = insertAt.toInt,
          updatedTo = insertAt.toInt + textToInsertLength
        ))
    )
  }
}
|
|
|
|
/**
|
|
* Encodes information about insertions/deletions/replacements made to a string, providing
|
|
* the original string, the updated string, and a list of TextReplacement objects
|
|
* that encode the indices of the segments that were changed. Using this information,
|
|
* it is possible to map an offset into the original string to an offset into the updated
|
|
* string, assuming the text at the offset was not within one of the modified segments.
|
|
*
|
|
* All offsets are code-points, not UTF16 code-units.
|
|
*/
|
|
case class TextModification(
  original: String,
  updated: String,
  replacements: List[TextReplacement]) {

  // length of the original text, used as the upper bound when reindexing
  private val originalLen = Offset.CodePoint.length(original)

  /**
   * Maps an offset into the original string onto the matching offset in the updated
   * string. Yields None when the offset lies inside a segment that was removed/replaced.
   */
  def reindex(index: Offset.CodePoint): Option[Offset.CodePoint] = {
    val noShift = Offset.CodePoint(0)
    reindex(index, noShift, replacements)
  }

  /**
   * Reindexes a single entity of type T. Yields the moved entity, or None if either its
   * `fromIndex` or its `toIndex` value is now out of range.
   */
  def reindexEntity[T: TextEntity](e: T): Option[T] =
    reindex(Offset.CodePoint(TextEntity.fromIndex(e))).flatMap { from =>
      // `toIndex` is reindexed through the position just before it, then moved back by one
      reindex(Offset.CodePoint(TextEntity.toIndex(e) - 1)).map { to =>
        TextEntity.move(e, from.toShort, (to.toShort + 1).toShort)
      }
    }

  /**
   * Reindexes every entity in the sequence. Entities for which `reindexEntity` yields
   * None (e.g. they span removed text) are left out of the result.
   */
  def reindexEntities[T: TextEntity](es: Seq[T]): Seq[T] =
    es.flatMap(entity => reindexEntity(entity))

  /**
   * The reverse modification: `original` and `updated` trade places and each
   * `TextReplacement` is inverted.
   */
  def inverse: TextModification =
    TextModification(
      original = updated,
      updated = original,
      replacements = replacements.map(rep => rep.inverse)
    )

  // Walks the list of TextReplacement objects, accumulating in 'shift' the amount to
  // add/subtract for every replacement that ends at or before 'index'; the accumulated
  // shift is added to index at the end. Note that the range check against
  // [0, originalLen] is only performed once the list of replacements is exhausted.
  @scala.annotation.tailrec
  private def reindex(
    index: Offset.CodePoint,
    shift: Offset.CodePoint,
    reps: List[TextReplacement]
  ): Option[Offset.CodePoint] =
    reps match {
      case Nil if index.toInt >= 0 && index <= originalLen =>
        Some(index + shift)
      case Nil =>
        None
      case head :: _ if index < head.originalFrom =>
        // index lies before the next replacement: only earlier shifts apply
        Some(index + shift)
      case head :: _ if index < head.originalTo =>
        // index lies inside a replaced segment
        None
      case head :: rest =>
        reindex(index, shift + head.lengthDelta, rest)
    }
}
|
|
|
|
object TextReplacement {

  /**
   * Builds a `TextReplacement` from plain `Int` values, wrapping each one in an
   * `Offset.CodePoint`.
   */
  def fromCodePoints(
    originalFrom: Int,
    originalTo: Int,
    updatedFrom: Int,
    updatedTo: Int
  ): TextReplacement =
    TextReplacement(
      originalFrom = Offset.CodePoint(originalFrom),
      originalTo = Offset.CodePoint(originalTo),
      updatedFrom = Offset.CodePoint(updatedFrom),
      updatedTo = Offset.CodePoint(updatedTo)
    )
}
|
|
|
|
/**
|
|
* Encodes the indices of a segment of text in one string that maps to a replacement
|
|
* segment in an updated version of the text. The replacement segment could be empty
|
|
* (updatedTo == updatedFrom), indicating the segment was removed.
|
|
*
|
|
* All offsets are code-points, not UTF16 code-units.
|
|
*
|
|
* `originalFrom` and `updatedFrom` are inclusive.
|
|
* `originalTo` and `updatedTo` are exclusive.
|
|
*/
|
|
case class TextReplacement(
  originalFrom: Offset.CodePoint,
  originalTo: Offset.CodePoint,
  updatedFrom: Offset.CodePoint,
  updatedTo: Offset.CodePoint) {

  /** Length of the segment in the original text (`originalTo - originalFrom`). */
  def originalLength: Offset.CodePoint = originalTo - originalFrom

  /** Length of the segment in the updated text (`updatedTo - updatedFrom`). */
  def updatedLength: Offset.CodePoint = updatedTo - updatedFrom

  /** Difference between the updated and the original segment length. */
  def lengthDelta: Offset.CodePoint = updatedLength - originalLength

  /** Moves only the original segment by `offset`; the updated segment is kept. */
  def shiftOriginal(offset: Offset.CodePoint): TextReplacement =
    TextReplacement(
      originalFrom = originalFrom + offset,
      originalTo = originalTo + offset,
      updatedFrom = updatedFrom,
      updatedTo = updatedTo
    )

  /** Moves only the updated segment by `offset`; the original segment is kept. */
  def shiftUpdated(offset: Offset.CodePoint): TextReplacement =
    TextReplacement(
      originalFrom = originalFrom,
      originalTo = originalTo,
      updatedFrom = updatedFrom + offset,
      updatedTo = updatedTo + offset
    )

  /** Moves both the original and the updated segment by the same `offset`. */
  def shift(offset: Offset.CodePoint): TextReplacement =
    copy(
      originalFrom = originalFrom + offset,
      originalTo = originalTo + offset,
      updatedFrom = updatedFrom + offset,
      updatedTo = updatedTo + offset
    )

  /** Swaps the roles of the original and the updated segment. */
  def inverse: TextReplacement =
    copy(
      originalFrom = updatedFrom,
      originalTo = updatedTo,
      updatedFrom = originalFrom,
      updatedTo = originalTo
    )
}
|