Source code for kadi.modules.records.utils

# Copyright 2022 Karlsruhe Institute of Technology
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from datetime import timedelta

from elasticsearch_dsl import Q as Query
from flask import current_app
from flask_login import current_user

from .files import remove_file
from .files import remove_temporary_file
from .models import File
from .models import FileState
from .models import Record
from .models import TemporaryFile
from .models import Upload
from .models import UploadState
from .uploads import remove_upload
from kadi.ext.db import db
from kadi.lib.permissions.core import get_permitted_objects
from kadi.lib.resources.utils import get_filtered_resources
from kadi.lib.resources.utils import search_resources
from kadi.lib.tags.models import Tag
from kadi.lib.utils import is_quoted
from kadi.lib.utils import utcnow
from kadi.modules.collections.models import Collection
from kadi.modules.collections.models import CollectionState
from kadi.modules.collections.utils import get_child_collections


def search_records(
    search_query=None,
    page=1,
    per_page=10,
    sort="_score",
    visibility=None,
    user_ids=None,
    collection_ids=None,
    child_collections=False,
    record_types=None,
    tags=None,
    tag_operator="or",
    mimetypes=None,
    extra_queries=None,
    user=None,
):
    """Search for and filter records.

    The records are first filtered in the database via
    :func:`kadi.lib.resources.utils.get_filtered_resources` and the filters
    listed below. The IDs of the remaining records are then handed to
    :func:`kadi.lib.resources.utils.search_resources` together with the search
    query and the query built from the extra metadata queries.

    :param search_query: (optional) See
        :func:`kadi.lib.resources.utils.search_resources`.
    :param page: (optional) See
        :func:`kadi.lib.resources.utils.search_resources`.
    :param per_page: (optional) See
        :func:`kadi.lib.resources.utils.search_resources`.
    :param sort: (optional) See
        :func:`kadi.lib.resources.utils.search_resources`.
    :param visibility: (optional) See
        :func:`kadi.lib.resources.utils.get_filtered_resources`.
    :param user_ids: (optional) See
        :func:`kadi.lib.resources.utils.get_filtered_resources`.
    :param collection_ids: (optional) A list of collection IDs the searched
        records need to belong to.
    :param child_collections: (optional) Flag indicating whether the records of
        the children of the given collection IDs should be included.
    :param record_types: (optional) A list of record types to filter the records
        with.
    :param tags: (optional) A list of tag names to filter the records with.
    :param tag_operator: (optional) The operator to filter the tags with. One of
        ``"or"`` or ``"and"``. Any other value is treated like ``"or"``.
    :param mimetypes: (optional) A list of MIME types to filter the records with
        based on their active files.
    :param extra_queries: (optional) A list of dictionaries to specify search
        queries within the extra metadata of records. Each query can contain a
        link type, a key, a type and one or multiple values depending on the
        type. See also :attr:`.Record.extras`.

        **Example:**

        .. code-block:: python3

            [
                {
                    # The link type, one of "and" or "or". Note that the link
                    # type of the first query can also be left out.
                    "link": "and",
                    # The key of the metadatum. Supports exact matches when
                    # surrounded by double quotes.
                    "key": "sample key",
                    # The type of the metadatum, one of "str", "numeric" (for
                    # integer and float values), "bool" or "date".
                    "type": "str",
                    # The string value of the metadatum if the type is "str".
                    # Supports exact matches when surrounded by double quotes.
                    "str": "string",
                    # Restrictions about the numeric value of the metadatum if
                    # the type is "numeric". Either a minimum value ("min"), a
                    # maximum value ("max") or both can be specified. Note that
                    # the range is always inclusive. Units ("unit") can
                    # optionally be specified as well and are always matched
                    # exactly.
                    "numeric": {"min": 0, "max": 1, "unit": "cm"},
                    # The boolean value of the metadatum if the type is "bool",
                    # one of True, "true", False or "false".
                    "bool": True,
                    # Restrictions about the date value of the metadatum if the
                    # type is "date". Either a minimum value ("min"), a maximum
                    # value ("max") or both can be specified as a formatted date
                    # string. Note that the range is always inclusive.
                    "date": {
                        "min": "2020-07-01T00:00:00.000Z",
                        "max": "2020-07-02T00:00:00.000Z",
                    },
                },
            ]

    :param user: (optional) The user to check for any permissions regarding the
        searched records. Defaults to the current user.
    :return: The search results as returned by
        :func:`kadi.lib.resources.utils.search_resources`.
    """
    if user is None:
        user = current_user

    records_query = get_filtered_resources(
        Record, visibility=visibility, user_ids=user_ids, user=user
    )

    # Restrict the records to the given collections, as far as they are active and
    # readable by the user.
    if collection_ids:
        readable_collections = get_permitted_objects(
            user, "read", "collection"
        ).filter(
            Collection.state == CollectionState.ACTIVE,
            Collection.id.in_(collection_ids),
        )

        if child_collections:
            descendant_ids = [
                child.id
                for parent in readable_collections
                for child in get_child_collections(parent)
            ]
            readable_collections = readable_collections.union(
                Collection.query.filter(Collection.id.in_(descendant_ids))
            )

        permitted_ids = readable_collections.with_entities(Collection.id)
        records_query = records_query.join(Record.collections).filter(
            Collection.id.in_(permitted_ids)
        )

    if record_types:
        records_query = records_query.filter(Record.type.in_(record_types))

    if tags:
        if tag_operator == "and":
            # Every single tag has to be attached to the record.
            records_query = records_query.filter(
                *[Record.tags.any(Tag.name == tag) for tag in tags]
            )
        else:
            # Every other operator is treated as "or".
            records_query = records_query.join(Record.tags).filter(
                Tag.name.in_(tags)
            )

    if mimetypes:
        records_query = records_query.join(File).filter(
            File.state == FileState.ACTIVE, File.mimetype.in_(mimetypes)
        )

    record_ids = [row.id for row in records_query.with_entities(Record.id)]

    # Translate the extra queries into a single Elasticsearch DSL query object. The
    # individual queries are grouped as (Q1 AND Q2) OR (Q3 AND Q4), so the link type
    # of the very first query has no effect.
    extra_es_query = None

    if extra_queries:
        finished_groups = []
        current_group = []

        for extra_query in extra_queries:
            es_query = _extras_dict_to_query(extra_query)

            if not es_query:
                continue

            if extra_query.get("link") != "or":
                current_group.append(es_query)
                continue

            # An "or" link closes the current AND group and starts the next one.
            if current_group:
                finished_groups.append(Query("bool", must=current_group))

            current_group = [es_query]

        finished_groups.append(Query("bool", must=current_group))
        extra_es_query = Query("bool", should=finished_groups)

    return search_resources(
        Record,
        search_query=search_query,
        page=page,
        per_page=per_page,
        sort=sort,
        filter_ids=record_ids,
        extra_es_query=extra_es_query,
    )


def _make_extra_key_query(extra_type, extra_key, nested=False): should_query = [] if is_quoted(extra_key): extra_key = extra_key[1:-1] else: match_query = Query( "match", **{f"extras_{extra_type}.key": {"query": extra_key, "fuzziness": "AUTO"}}, ) if not nested: should_query.append(match_query) else: should_query.append( Query("nested", path=f"extras_{extra_type}", query=match_query) ) # Always run a term query as well that is weighted more heavily. term_query = Query( "term", **{f"extras_{extra_type}.key.keyword": {"value": extra_key, "boost": 5}} ) if not nested: should_query.append(term_query) else: should_query.append( Query("nested", path=f"extras_{extra_type}", query=term_query) ) return Query("bool", should=should_query) def _extras_dict_to_query(query_dict): extra_type = str(query_dict.get("type", "")) extra_key = str(query_dict.get("key", "")) # Build a query for a single string value and its key. if extra_type == "str": str_query = [] str_value = str(query_dict.get("str", "")) if str_value: should_query = [] if is_quoted(str_value): str_value = str_value[1:-1] else: should_query.append( Query( "match", extras_str__value={"query": str_value, "fuzziness": "AUTO"}, ) ) # Always run a term query as well that is weighted more heavily. should_query.append( Query( "term", extras_str__value__keyword={"value": str_value, "boost": 5} ) ) str_query.append(Query("bool", should=should_query)) if extra_key: str_query.append(_make_extra_key_query("str", extra_key)) return Query("nested", path="extras_str", query=Query("bool", must=str_query)) # Build a query for a single numeric value and its key. 
if extra_type == "numeric": int_query = [] float_query = [] numeric_dict = query_dict.get("numeric") if not isinstance(numeric_dict, dict): numeric_dict = {} min_value = str(numeric_dict.get("min", "")) max_value = str(numeric_dict.get("max", "")) unit_value = str(numeric_dict.get("unit", "")) if min_value: int_query.append(Query("range", extras_int__value={"gte": min_value})) float_query.append(Query("range", extras_float__value={"gte": min_value})) if max_value: int_query.append(Query("range", extras_int__value={"lte": max_value})) float_query.append(Query("range", extras_float__value={"lte": max_value})) if unit_value: int_query.append(Query("match", extras_int__unit=unit_value)) float_query.append(Query("match", extras_float__unit=unit_value)) if extra_key: int_query.append(_make_extra_key_query("int", extra_key)) float_query.append(_make_extra_key_query("float", extra_key)) return Query( "bool", should=[ Query("nested", path="extras_int", query=Query("bool", must=int_query)), Query( "nested", path="extras_float", query=Query("bool", must=float_query) ), ], ) # Build a query for a single bool value and its key. if extra_type == "bool": bool_query = [] bool_value = str(query_dict.get("bool", "")) if bool_value.lower() == "true": bool_query.append(Query("term", extras_bool__value=True)) elif bool_value.lower() == "false": bool_query.append(Query("term", extras_bool__value=False)) if extra_key: bool_query.append(_make_extra_key_query("bool", extra_key)) return Query("nested", path="extras_bool", query=Query("bool", must=bool_query)) # Build a query for a single date value and its key. 
if extra_type == "date": date_query = [] date_dict = query_dict.get("date") if not isinstance(date_dict, dict): date_dict = {} min_value = str(date_dict.get("min", "")) max_value = str(date_dict.get("max", "")) if min_value: date_query.append(Query("range", extras_date__value={"gte": min_value})) if max_value: date_query.append(Query("range", extras_date__value={"lte": max_value})) if extra_key: date_query.append(_make_extra_key_query("date", extra_key)) return Query("nested", path="extras_date", query=Query("bool", must=date_query)) # Build a query for a key of any type. if extra_key: return Query( "bool", should=[ _make_extra_key_query(extra_type, extra_key, nested=True) for extra_type in ["str", "int", "float", "bool", "date"] ], ) return None
def clean_files(inside_task=False):
    """Clean all expired and/or inactive files, uploads and temporary files.

    Three kinds of objects are removed, in this order:

    * Inactive files last modified before ``INACTIVE_FILES_MAX_AGE`` seconds ago.
    * Active uploads last modified before ``UPLOADS_MAX_AGE`` seconds ago, as well
      as inactive uploads last modified more than five minutes ago.
    * Temporary files last modified before ``TEMPORARY_FILES_MAX_AGE`` seconds ago.

    Note that this function may issue one or more database commits.

    :param inside_task: (optional) A flag indicating whether the function is
        executed in a task. In that case, additional information will be logged.
    """
    # Remove expired inactive files.
    expiration_date = utcnow() - timedelta(
        seconds=current_app.config["INACTIVE_FILES_MAX_AGE"]
    )
    files = File.query.filter(
        File.state == FileState.INACTIVE, File.last_modified < expiration_date
    )

    if inside_task:
        # Count only once, so the logged number is the one that was checked and no
        # second COUNT query has to be issued.
        num_files = files.count()

        if num_files > 0:
            current_app.logger.info(f"Cleaning {num_files} inactive file(s).")

    for file in files:
        remove_file(file, delete_from_db=False)

    # Remove expired and inactive uploads.
    active_expiration_date = utcnow() - timedelta(
        seconds=current_app.config["UPLOADS_MAX_AGE"]
    )
    # Leave inactive uploads intact for at least a small amount of time, so their
    # status can always be queried in case the upload required a task for processing
    # it and something went wrong.
    inactive_expiration_date = utcnow() - timedelta(minutes=5)
    uploads = Upload.query.filter(
        db.or_(
            db.and_(
                Upload.state == UploadState.ACTIVE,
                Upload.last_modified < active_expiration_date,
            ),
            db.and_(
                Upload.state == UploadState.INACTIVE,
                Upload.last_modified < inactive_expiration_date,
            ),
        )
    )

    if inside_task:
        num_uploads = uploads.count()

        if num_uploads > 0:
            current_app.logger.info(
                f"Cleaning {num_uploads} expired or inactive upload(s)."
            )

    for upload in uploads:
        remove_upload(upload)

    # Remove expired temporary files.
    expiration_date = utcnow() - timedelta(
        seconds=current_app.config["TEMPORARY_FILES_MAX_AGE"]
    )
    temporary_files = TemporaryFile.query.filter(
        TemporaryFile.last_modified < expiration_date
    )

    if inside_task:
        num_temporary_files = temporary_files.count()

        if num_temporary_files > 0:
            current_app.logger.info(
                f"Cleaning {num_temporary_files} temporary file(s)."
            )

    for temporary_file in temporary_files:
        remove_temporary_file(temporary_file)