Skip to content
57 changes: 57 additions & 0 deletions core/common/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,63 @@

from core.common.constants import ES_REQUEST_TIMEOUT
from core.common.utils import is_url_encoded_string
from core.orgs.constants import ORG_OBJECT_TYPE
from core.users.constants import USER_OBJECT_TYPE


def get_document_public_visibility_criteria(
    user,
    include_creator_private_access=False,
    include_owner_private_access=False,
    include_organization_memberships=False,
):
    """Build the Elasticsearch visibility clause for owner-scoped documents.

    The result always matches documents with ``public_can_view=True``. For an
    authenticated user it may additionally match private documents
    (``public_can_view=False``) that satisfy any of the enabled grants:

    * ``include_creator_private_access``: ``created_by`` equals the username.
    * ``include_owner_private_access``: owner type is ``USER_OBJECT_TYPE`` and
      ``owner`` equals the lower-cased username.
    * ``include_organization_memberships``: owner type is ``ORG_OBJECT_TYPE``
      and ``owner`` is one of the lower-cased mnemonics from
      ``user.organizations``.

    When the user is not authenticated, or no grant applies, only the public
    clause is returned.
    """
    public_clause = Q('term', public_can_view=True)
    if not getattr(user, 'is_authenticated', False):
        return public_clause

    username = getattr(user, 'username', None)
    grants = []

    if username and include_creator_private_access:
        grants.append(Q('term', created_by=username))

    if username and include_owner_private_access:
        grants.append(
            Q('term', owner_type=USER_OBJECT_TYPE) & Q('term', owner=username.lower())
        )

    if include_organization_memberships:
        member_mnemonics = [
            org_mnemonic.lower()
            for org_mnemonic in user.organizations.values_list('mnemonic', flat=True)
        ]
        if member_mnemonics:
            grants.append(
                Q('term', owner_type=ORG_OBJECT_TYPE) & Q('terms', owner=member_mnemonics)
            )

    if not grants:
        return public_clause

    # OR the grants together left-to-right, in the order they were collected.
    private_clause = grants[0]
    for grant in grants[1:]:
        private_clause = private_clause | grant

    return public_clause | (Q('term', public_can_view=False) & private_clause)


def apply_document_public_visibility_filter(
    search,
    user,
    include_creator_private_access=False,
    include_owner_private_access=False,
    include_organization_memberships=False,
):
    """Restrict ``search`` to documents visible to ``user``.

    Users whose ``is_staff`` attribute is truthy get the search back untouched.
    For everyone else the clause built by
    ``get_document_public_visibility_criteria`` (with the same ``include_*``
    flags) is added as a filter.
    """
    bypass_filter = getattr(user, 'is_staff', False)
    if not bypass_filter:
        visibility_clause = get_document_public_visibility_criteria(
            user,
            include_creator_private_access=include_creator_private_access,
            include_owner_private_access=include_owner_private_access,
            include_organization_memberships=include_organization_memberships,
        )
        search = search.filter(visibility_clause)
    return search


class CustomESFacetedSearch(FacetedSearch):
Expand Down
74 changes: 41 additions & 33 deletions core/common/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,20 @@
CANONICAL_URL_REQUEST_PARAM, CHECKSUMS_PARAM, ACCESS_TYPE_NONE
from core.common.exceptions import Http400
from core.common.mixins import PathWalkerMixin
from core.common.search import CustomESSearch
from core.common.search import CustomESSearch, get_document_public_visibility_criteria
from core.common.serializers import RootSerializer
from core.common.swagger_parameters import all_resource_query_param
from core.common.throttling import ThrottleUtil
from core.common.utils import compact_dict_by_values, to_snake_case, parse_updated_since_param, \
to_int, get_falsy_values, get_truthy_values, format_url_for_search
from core.concepts.search import (
get_concept_exact_search_criterion,
get_concept_fuzzy_search_criterion,
get_concept_mandatory_exclude_words_criteria,
get_concept_mandatory_words_criteria,
get_concept_search_rescore,
get_concept_wildcard_search_criterion,
)
from core.concepts.permissions import CanViewParentDictionary, CanEditParentDictionary
from core.orgs.constants import ORG_OBJECT_TYPE
from core.users.constants import USER_OBJECT_TYPE
Expand Down Expand Up @@ -292,6 +300,12 @@ def get_sort_attributes(self):
return result

def get_fuzzy_search_criterion(self, boost_divide_by=10, expansions=5):
if self.is_concept_document():
return get_concept_fuzzy_search_criterion(
self.get_raw_search_string(),
boost_divide_by=boost_divide_by,
expansions=expansions,
)
return CustomESSearch.get_fuzzy_match_criterion(
search_str=self.get_search_string(decode=False),
fields=self.get_fuzzy_search_fields(),
Expand All @@ -300,13 +314,23 @@ def get_fuzzy_search_criterion(self, boost_divide_by=10, expansions=5):
)

def get_wildcard_search_criterion(self, search_str=None):
    """Return ``(criterion, field_names)`` for a wildcard match.

    Concept documents delegate to ``get_concept_wildcard_search_criterion``,
    passing the explicit ``search_str`` or else the raw search string, and
    including map-code fields unless the ``SEARCH_MAP_CODES_PARAM`` query
    parameter holds a falsy value. Other documents use this view's wildcard
    search fields with ``CustomESSearch.get_wildcard_match_criterion``.
    """
    if not self.is_concept_document():
        wildcard_fields = self.get_wildcard_search_fields()
        criterion = CustomESSearch.get_wildcard_match_criterion(
            search_str=search_str or self.get_search_string(),
            fields=wildcard_fields
        )
        return criterion, wildcard_fields.keys()

    concept_query = search_str or self.get_raw_search_string()
    map_codes_flag = self.request.query_params.get(SEARCH_MAP_CODES_PARAM)
    return get_concept_wildcard_search_criterion(
        concept_query,
        include_map_codes=map_codes_flag not in get_falsy_values(),
    )

def get_exact_search_criterion(self):
if self.is_concept_document():
return get_concept_exact_search_criterion(
self.get_raw_search_string(),
include_map_codes=self.request.query_params.get(SEARCH_MAP_CODES_PARAM) not in get_falsy_values(),
)
match_phrase_field_list = self.document_model.get_match_phrase_attrs()
match_word_fields_map = self.clean_fields(self.document_model.get_exact_match_attrs())
fields = match_phrase_field_list + list(match_word_fields_map.keys())
Expand Down Expand Up @@ -662,16 +686,19 @@ def is_user_scope(self):
return False

def get_public_criteria(self):
    """Return the Elasticsearch visibility criterion for the requesting user.

    Anonymous users only match documents with ``public_can_view=True``.
    Authenticated users additionally match:

    * private organization documents whose ``user`` field equals their
      username, when the document model is ``OrganizationDocument``;
    * for concept-container and source-child document models, whatever
      ``get_document_public_visibility_criteria`` grants with creator
      private access enabled.
    """
    user = self.request.user
    criteria = Q('term', public_can_view=True)

    if user.is_authenticated:
        username = user.username
        # Kept as a function-scope import, as in the original
        # (presumably to avoid a circular import; confirm before hoisting).
        from core.orgs.documents import OrganizationDocument
        if self.document_model in [OrganizationDocument]:
            criteria |= (Q('term', public_can_view=False) & Q('term', user=username))
        if self.is_concept_container_document_model() or self.is_source_child_document_model():
            # The shared helper builds the public-or-created-by-me clause, so the
            # former inline `criteria |= ... created_by ...` update is not needed.
            return get_document_public_visibility_criteria(
                user,
                include_creator_private_access=True,
            )

    return criteria

Expand Down Expand Up @@ -884,49 +911,30 @@ def __get_search_results(self, ignore_retired_filter=False, sort=True, highlight

sort_attrs = self._get_sort_attribute()
if self.is_concept_document() and (not sort_attrs or '_score' in get(sort_attrs, '0', {})):
search_str = self.get_search_string(lower=False)
results = results.extra(
rescore={
"window_size": 400,
"query": {
"score_mode": "total",
"query_weight": 1.0,
"rescore_query_weight": 800.0,
"rescore_query": {
"dis_max": {
"tie_breaker": 0.0,
"queries": [
{
"constant_score": {
"filter": { "term": { "_name": { "value": search_str, "case_insensitive": True } } },
"boost": 10.0
}
},
{
"constant_score": {
"filter": { "term": { "_synonyms": { "value": search_str, "case_insensitive": True } } },
"boost": 8.0
}
}
]
}
}
}
}
)
results = results.extra(rescore=get_concept_search_rescore(self.get_raw_search_string()))
if fields and highlight and self.request.query_params.get(INCLUDE_SEARCH_META_PARAM) in get_truthy_values():
results = results.highlight(*self.clean_fields_for_highlight(fields))
results = results.source(excludes=['_synonyms_embeddings', '_embeddings'])
return results.sort(*sort_attrs) if sort else results

def get_mandatory_words_criteria(self):
    """Return the AND-combined wildcard criterion for required words, or None.

    Concept documents delegate to ``get_concept_mandatory_words_criteria``
    (map-code fields are included unless the ``SEARCH_MAP_CODES_PARAM`` query
    parameter holds a falsy value). For other documents, each word reported by
    ``CustomESSearch.get_must_haves`` for the raw search string becomes a
    ``<word>*`` wildcard criterion, and all of them are AND-ed together.
    """
    if self.is_concept_document():
        raw_query = self.get_raw_search_string()
        map_codes_flag = self.request.query_params.get(SEARCH_MAP_CODES_PARAM)
        return get_concept_mandatory_words_criteria(
            raw_query,
            include_map_codes=map_codes_flag not in get_falsy_values(),
        )

    combined = None
    required_words = CustomESSearch.get_must_haves(self.get_raw_search_string())
    for required_word in required_words:
        word_criterion, _ = self.get_wildcard_search_criterion(f"{required_word}*")
        if combined is None:
            combined = word_criterion
        else:
            combined = combined & word_criterion
    return combined

def get_mandatory_exclude_words_criteria(self):
if self.is_concept_document():
return get_concept_mandatory_exclude_words_criteria(
self.get_raw_search_string(),
include_map_codes=self.request.query_params.get(SEARCH_MAP_CODES_PARAM) not in get_falsy_values(),
)
criterion = None
for must_not_have in CustomESSearch.get_must_not_haves(self.get_raw_search_string()):
criteria, _ = self.get_wildcard_search_criterion(f"{must_not_have}*")
Expand Down
184 changes: 184 additions & 0 deletions core/concepts/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,192 @@
from core.common.constants import FACET_SIZE, HEAD
from core.common.search import CustomESFacetedSearch, CustomESSearch
from core.common.utils import get_embeddings, is_canonical_uri
from core.concepts.documents import ConceptDocument
from core.concepts.models import Concept

# Default `boost_divide_by` value that the concept fuzzy-search helpers in this
# module pass to CustomESSearch.get_fuzzy_match_criterion.
CONCEPT_FUZZY_BOOST_DIVIDE_BY = 10000
# Default `expansions` value that the same helpers pass to
# CustomESSearch.get_fuzzy_match_criterion.
CONCEPT_FUZZY_EXPANSIONS = 2


def normalize_concept_search_query(query):
    """Return ``query`` as text without quote characters or outer whitespace.

    Falsy input (``None``, ``''``, ``0``...) yields an empty string; anything
    else is converted with ``str()`` first. Both double and single quotes are
    removed wherever they occur.
    """
    text = str(query) if query else ''
    for quote_char in ('"', "'"):
        text = text.replace(quote_char, '')
    return text.strip()


def filter_concept_search_fields(fields, include_map_codes=True):
    """Drop fields whose name ends with ``map_codes`` when they are not wanted.

    With ``include_map_codes`` truthy, ``fields`` is returned as-is (same
    object). Otherwise a dict input yields a filtered dict and any other
    iterable of names yields a filtered list.
    """
    if not include_map_codes:
        def is_kept(field_name):
            return not field_name.endswith('map_codes')

        if isinstance(fields, dict):
            return {name: setting for name, setting in fields.items() if is_kept(name)}
        return list(filter(is_kept, fields))

    return fields


def get_concept_search_string(query, lower=True, decode=True):
    """Normalize ``query`` and format it with ``CustomESSearch.get_search_string``.

    ``lower`` and ``decode`` are forwarded unchanged to that formatter.
    """
    normalized_query = normalize_concept_search_query(query)
    return CustomESSearch.get_search_string(normalized_query, lower=lower, decode=decode)


def get_concept_exact_search_criterion(query, include_map_codes=True):
    """Return ``(criterion, fields)`` for an exact concept match.

    The criterion comes from ``CustomESSearch.get_exact_match_criterion`` using
    ConceptDocument's match-phrase attributes and its exact-match attributes
    (minus map-code fields when ``include_map_codes`` is falsy). ``fields`` is
    the phrase fields followed by the exact-match field names.
    """
    phrase_fields = ConceptDocument.get_match_phrase_attrs()
    exact_word_fields = filter_concept_search_fields(
        ConceptDocument.get_exact_match_attrs(),
        include_map_codes=include_map_codes,
    )
    searched_fields = phrase_fields + list(exact_word_fields.keys())

    # Exact matching uses the normalized query without lower-casing or decoding.
    search_text = get_concept_search_string(query, lower=False, decode=False)
    criterion = CustomESSearch.get_exact_match_criterion(
        search_text,
        phrase_fields,
        exact_word_fields,
    )
    return criterion, searched_fields


def get_concept_wildcard_search_criterion(query, include_map_codes=True):
    """Return ``(criterion, field_names)`` for a wildcard concept match.

    Uses ConceptDocument's wildcard search attributes (minus map-code fields
    when ``include_map_codes`` is falsy) and the normalized search string.
    ``field_names`` is a list of the field names that were searched.
    """
    wildcard_fields = filter_concept_search_fields(
        ConceptDocument.get_wildcard_search_attrs(),
        include_map_codes=include_map_codes,
    )
    search_text = get_concept_search_string(query)
    criterion = CustomESSearch.get_wildcard_match_criterion(
        search_str=search_text,
        fields=wildcard_fields,
    )
    return criterion, list(wildcard_fields.keys())


def get_concept_fuzzy_search_criterion(
    query,
    boost_divide_by=CONCEPT_FUZZY_BOOST_DIVIDE_BY,
    expansions=CONCEPT_FUZZY_EXPANSIONS,
):
    """Return the fuzzy-match criterion for a concept query.

    The normalized (not decoded) search string is matched against
    ConceptDocument's fuzzy search attributes; ``boost_divide_by`` and
    ``expansions`` are forwarded to ``CustomESSearch.get_fuzzy_match_criterion``.
    """
    search_text = get_concept_search_string(query, decode=False)
    fuzzy_fields = ConceptDocument.get_fuzzy_search_attrs()
    return CustomESSearch.get_fuzzy_match_criterion(
        search_str=search_text,
        fields=fuzzy_fields,
        boost_divide_by=boost_divide_by,
        expansions=expansions,
    )


def get_concept_mandatory_words_criteria(query, include_map_codes=True):
    """Return the AND of ``<word>*`` wildcard criteria for required words.

    Required words are those reported by ``CustomESSearch.get_must_haves`` for
    the normalized query. Returns ``None`` when there are none.
    """
    required_words = CustomESSearch.get_must_haves(normalize_concept_search_query(query))

    combined = None
    for required_word in required_words:
        word_criterion, _ = get_concept_wildcard_search_criterion(
            f"{required_word}*",
            include_map_codes=include_map_codes,
        )
        if combined is None:
            combined = word_criterion
        else:
            combined = combined & word_criterion
    return combined


def get_concept_mandatory_exclude_words_criteria(query, include_map_codes=True):
    """Return the OR of ``<word>*`` wildcard criteria for excluded words.

    Excluded words are those reported by ``CustomESSearch.get_must_not_haves``
    for the normalized query. The result is NOT negated here; returns ``None``
    when there are no excluded words.
    """
    excluded_words = CustomESSearch.get_must_not_haves(normalize_concept_search_query(query))

    combined = None
    for excluded_word in excluded_words:
        word_criterion, _ = get_concept_wildcard_search_criterion(
            f"{excluded_word}*",
            include_map_codes=include_map_codes,
        )
        if combined is None:
            combined = word_criterion
        else:
            combined = combined | word_criterion
    return combined


def get_concept_search_rescore(query):
    """Return the Elasticsearch ``rescore`` body used for concept searches.

    The rescore query is a ``dis_max`` over two constant-score clauses that
    test a case-insensitive ``term`` match of the normalized (case-preserved)
    search string against ``_name`` (boost 10.0) and ``_synonyms`` (boost 8.0).
    """
    search_str = get_concept_search_string(query, lower=False)

    def exact_term_clause(field_name, boost):
        # One constant-score clause: case-insensitive term match on a single field.
        return {
            "constant_score": {
                "filter": {
                    "term": {
                        field_name: {
                            "value": search_str,
                            "case_insensitive": True,
                        }
                    }
                },
                "boost": boost,
            }
        }

    rescore_query = {
        "dis_max": {
            "tie_breaker": 0.0,
            "queries": [
                exact_term_clause("_name", 10.0),
                exact_term_clause("_synonyms", 8.0),
            ]
        }
    }

    return {
        "window_size": 400,
        "query": {
            "score_mode": "total",
            "query_weight": 1.0,
            "rescore_query_weight": 800.0,
            "rescore_query": rescore_query,
        },
    }


def apply_concept_text_search(
    search,
    query,
    include_wildcard=True,
    include_fuzzy=True,
    include_map_codes=True,
    fuzzy_boost_divide_by=CONCEPT_FUZZY_BOOST_DIVIDE_BY,
    fuzzy_expansions=CONCEPT_FUZZY_EXPANSIONS,
    include_rescore=False,
):
    """Add the concept text-search clauses to ``search``.

    The main query is the exact-match criterion, OR-ed with the wildcard and
    fuzzy criteria when those are enabled. Required words are then added as a
    filter, excluded words as a negated filter, and the concept rescore block
    is attached when ``include_rescore`` is truthy.

    Returns ``(search, fields)``, where ``fields`` lists the exact-match fields
    followed by the wildcard fields (if wildcard matching is enabled).
    """
    text_criterion, searched_fields = get_concept_exact_search_criterion(
        query,
        include_map_codes=include_map_codes,
    )

    if include_wildcard:
        wildcard_criterion, wildcard_fields = get_concept_wildcard_search_criterion(
            query,
            include_map_codes=include_map_codes,
        )
        text_criterion = text_criterion | wildcard_criterion
        searched_fields.extend(wildcard_fields)

    if include_fuzzy:
        fuzzy_criterion = get_concept_fuzzy_search_criterion(
            query,
            boost_divide_by=fuzzy_boost_divide_by,
            expansions=fuzzy_expansions,
        )
        text_criterion = text_criterion | fuzzy_criterion

    search = search.query(text_criterion)

    required = get_concept_mandatory_words_criteria(query, include_map_codes=include_map_codes)
    if required is not None:
        search = search.filter(required)

    excluded = get_concept_mandatory_exclude_words_criteria(query, include_map_codes=include_map_codes)
    if excluded is not None:
        # The helper returns the un-negated OR of excluded words; negate it here.
        search = search.filter(~excluded)

    if include_rescore:
        search = search.extra(rescore=get_concept_search_rescore(query))

    return search, searched_fields


class ConceptFacetedSearch(CustomESFacetedSearch):
index = 'concepts'
Expand Down
Loading