funkwhale/api/funkwhale_api/common/search.py

import re

from django.contrib.postgres.search import SearchQuery
from django.db.models import Q

from . import utils

QUERY_REGEX = re.compile(r'(((?P<key>\w+):)?(?P<value>"[^"]+"|[\S]+))')


def parse_query(query):
    """
    Given a search query such as "hello is:issue status:opened",
    returns a list of dictionaries describing each query token
    """
    matches = [m.groupdict() for m in QUERY_REGEX.finditer(query.lower())]
    for m in matches:
        if m["value"].startswith('"') and m["value"].endswith('"'):
            m["value"] = m["value"][1:-1]
    return matches


def normalize_query(
    query_string,
    findterms=re.compile(r'"([^"]+)"|(\S+)').findall,
    normspace=re.compile(r"\s{2,}").sub,
):
    """Splits the query string in individual keywords, getting rid of unnecessary spaces
    and grouping quoted words together.
    Example:

    >>> normalize_query('  some random  words "with   quotes  " and   spaces')
    ['some', 'random', 'words', 'with quotes', 'and', 'spaces']

    """
    return [normspace(" ", (t[0] or t[1]).strip()) for t in findterms(query_string)]


def get_query(query_string, search_fields):
    """Returns a query, that is a combination of Q objects. That combination
    aims to search keywords within a model by testing the given search fields.

    """
    query = None  # Query to search for every search term
    terms = normalize_query(query_string)
    for term in terms:
        or_query = None  # Query to search for a given term in each field
        for field_name in search_fields:
            q = Q(**{"%s__icontains" % field_name: term})
            if or_query is None:
                or_query = q
            else:
                or_query = or_query | q
        if query is None:
            query = or_query
        else:
            query = query & or_query
    return query


def remove_chars(string, chars):
    for char in chars:
        string = string.replace(char, "")
    return string


def get_fts_query(query_string, fts_fields=["body_text"], model=None):
    search_type = "raw"
    if query_string.startswith('"') and query_string.endswith('"'):
        # we pass the query directly to the FTS engine
        query_string = query_string[1:-1]
    else:
        query_string = remove_chars(query_string, ['"', "&", "(", ")", "!", "'"])
        parts = query_string.replace(":", "").split(" ")
        parts = [f"{p}:*" for p in parts if p]
        if not parts:
            return Q(pk=None)

        query_string = "&".join(parts)

    if not fts_fields or not query_string.strip():
        return Q(pk=None)
    query = None
    for field in fts_fields:
        if "__" in field and model:
            # When we have a nested lookup, we switch to a subquery for enhanced performance
            fk_field_name, lookup = (
                field.split("__")[0],
                "__".join(field.split("__")[1:]),
            )
            fk_field = model._meta.get_field(fk_field_name)
            related_model = fk_field.related_model
            subquery = related_model.objects.filter(
                **{
                    lookup: SearchQuery(
                        query_string, search_type=search_type, config="english_nostop"
                    )
                }
            ).values_list("pk", flat=True)
            new_query = Q(**{f"{fk_field_name}__in": list(subquery)})
        else:
            new_query = Q(
                **{
                    field: SearchQuery(
                        query_string, search_type=search_type, config="english_nostop"
                    )
                }
            )
        query = utils.join_queries_or(query, new_query)

    return query


def filter_tokens(tokens, valid):
    return [t for t in tokens if t["key"] in valid]


def apply(qs, config_data):
    for k in ["filter_query", "search_query"]:
        q = config_data.get(k)
        if q:
            qs = qs.filter(q)
    distinct = config_data.get("distinct", False)
    if distinct:
        qs = qs.distinct()
    return qs


class SearchConfig:
    def __init__(self, search_fields={}, filter_fields={}, types=[]):
        self.filter_fields = filter_fields
        self.search_fields = search_fields
        self.types = types

    def clean(self, query):
        tokens = parse_query(query)
        cleaned_data = {}
        cleaned_data["types"] = self.clean_types(filter_tokens(tokens, ["is"]))
        cleaned_data["search_query"] = self.clean_search_query(
            filter_tokens(tokens, [None, "in"] + list(self.search_fields.keys()))
        )
        unhandled_tokens = [
            t
            for t in tokens
            if t["key"] not in [None, "is", "in"] + list(self.search_fields.keys())
        ]
        cleaned_data["filter_query"], matching_filters = self.clean_filter_query(
            unhandled_tokens
        )
        if matching_filters:
            cleaned_data["distinct"] = any(
                [
                    self.filter_fields[k].get("distinct", False)
                    for k in matching_filters
                    if k in self.filter_fields
                ]
            )
        else:
            cleaned_data["distinct"] = False
        return cleaned_data

    def clean_search_query(self, tokens):
        if not self.search_fields or not tokens:
            return

        fields_subset = {
            f for t in filter_tokens(tokens, ["in"]) for f in t["value"].split(",")
        } or set(self.search_fields.keys())
        fields_subset = set(self.search_fields.keys()) & fields_subset
        to_fields = [self.search_fields[k]["to"] for k in fields_subset]

        specific_field_query = None
        for token in tokens:
            if token["key"] not in self.search_fields:
                continue
            to = self.search_fields[token["key"]]["to"]
            try:
                field = token["field"]
                value = field.clean(token["value"])
            except KeyError:
                # no cleaning to apply
                value = token["value"]
            q = Q(**{f"{to}__icontains": value})
            if not specific_field_query:
                specific_field_query = q
            else:
                specific_field_query &= q
        query_string = " ".join([t["value"] for t in filter_tokens(tokens, [None])])
        unhandled_tokens_query = get_query(query_string, sorted(to_fields))

        if specific_field_query and unhandled_tokens_query:
            return unhandled_tokens_query & specific_field_query
        elif specific_field_query:
            return specific_field_query
        elif unhandled_tokens_query:
            return unhandled_tokens_query
        return None

    def clean_filter_query(self, tokens):
        if not self.filter_fields or not tokens:
            return None, []

        matching = [t for t in tokens if t["key"] in self.filter_fields]
        queries = [self.get_filter_query(token) for token in matching]
        query = None
        for q in queries:
            if not query:
                query = q
            else:
                query = query & q
        return query, [m["key"] for m in matching]

    def get_filter_query(self, token):
        raw_value = token["value"]
        try:
            field = self.filter_fields[token["key"]]["field"]
            value = field.clean(raw_value)
        except KeyError:
            # no cleaning to apply
            value = raw_value
        try:
            query_field = self.filter_fields[token["key"]]["to"]
            return Q(**{query_field: value})
        except KeyError:
            pass

        # we don't have a basic filter -> field mapping, this likely means we
        # have a dynamic handler in the config
        handler = self.filter_fields[token["key"]]["handler"]
        value = handler(value)
        return value

    def clean_types(self, tokens):
        if not self.types:
            return []

        if not tokens:
            # no filtering on type, we return all types
            return [t for key, t in self.types]
        types = []
        for token in tokens:
            for key, t in self.types:
                if key.lower() == token["value"]:
                    types.append(t)

        return types