Source code for capitains_nautilus.cts.resolver

from glob import glob
import os.path
import logging
from werkzeug.contrib.cache import NullCache

import MyCapytain.errors
from MyCapytain.common.reference import URN, Reference
from MyCapytain.resolvers.cts.local import CtsCapitainsLocalResolver
from MyCapytain.resolvers.utils import CollectionDispatcher
from MyCapytain.resources.collections.cts import (
    XmlCtsTextInventoryMetadata as TextInventory,
    XmlCtsTextgroupMetadata as TextGroup,
    XmlCtsWorkMetadata as Work,
    XmlCtsCitation as Citation,
    XmlCtsEditionMetadata as Edition
)
from MyCapytain.resources.prototypes.cts.inventory import CtsTextInventoryCollection as TextInventoryCollection
from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText as Text
from MyCapytain.common.constants import set_graph

from capitains_nautilus import _cache_key
from capitains_nautilus.errors import *


[docs]class NautilusCTSResolver(CtsCapitainsLocalResolver): """ XML Folder Based resolver. :param resource: Resource should be a list of folders retaining data as Capitains Guidelines Repositories :type resource: [str] :param name: Key used to make cache key :param cache: Cache object to be used for the inventory :type cache: BaseCache :param logger: Logging object :type logger: logging.logger :ivar inventory_cache_key: Werkzeug Cache key to get or set cache for the TextInventory :ivar texts_cache_key: Werkzeug Cache key to get or set cache for lists of metadata texts objects :ivar texts_parsed: Werkzeug Cache key to get or set cache for lists of parsed texts objects :ivar texts: List of Text Metadata objects :ivar source: Original resource parameter .. warning :: This resolver does not support inventories """ TIMEOUT = 0 NautilusCTSResolver = False REMOVE_EMPTY = True CACHE_FULL_TEI = False def __init__(self, resource, name=None, logger=None, cache=None, dispatcher=None): """ Initiate the XMLResolver """ if dispatcher is None: inventory_collection = TextInventoryCollection(identifier="defaultTic") ti = TextInventory("default") ti.parent = inventory_collection ti.set_label("Default collection", "eng") self.dispatcher = CollectionDispatcher(inventory_collection) else: self.dispatcher = dispatcher self.__inventory__ = None self.__texts__ = [] self.name = name self.logger = logger if not logger: self.logger = logging.getLogger(name) if not name: self.name = "repository" if cache is None: cache = NullCache() self.__cache__ = cache self.__resources__ = resource self.inventory_cache_key = _cache_key("Nautilus", self.name, "Inventory", "Resources") self.texts_parsed_cache_key = _cache_key("Nautilus", self.name, "Inventory", "TextsParsed") @property def cache(self): return self.__cache__ @property def inventory(self): if self.__inventory__ is None or len(self.__inventory__.readableDescendants) == 0: self.__inventory__ = self.get_or(self.inventory_cache_key, self.parse, self.__resources__) set_graph(self.__inventory__.graph) return self.__inventory__ @inventory.setter def inventory(self, value): self.__inventory__ = value self.cache.set(self.inventory_cache_key, value, self.TIMEOUT) @property def texts(self): """ List of text known :rtype: list """ return self.inventory.readableDescendants def xmlparse(self, file): """ Parse a XML file :param file: Opened File :return: Tree """ if self.CACHE_FULL_TEI is True: return self.get_or( _cache_key("Nautilus", self.name, "File", "Tree", file.name), super(NautilusCTSResolver, self).xmlparse, file ) return super(NautilusCTSResolver, self).xmlparse(file) def get_or(self, cache_key, callback, *args, **kwargs): """ Get or set the cache using callback and arguments :param cache_key: Cache key for given resource :param callback: Callback if object does not exist :param args: Ordered Argument for the callback :param kwargs: Keyword argument for the callback :return: Output of the callback """ cached = self.cache.get(cache_key) if cached is not None: return cached else: try: output = callback(*args, **kwargs) except MyCapytain.errors.UnknownCollection as E: raise UnknownCollection(str(E)) except Exception as E: raise E self.cache.set(cache_key, output, self.TIMEOUT) return output def read(self, identifier, path=None): """ Read a text object given an identifier and a path :param identifier: Identifier of the text :param path: Path of the text files :return: Text """ if self.CACHE_FULL_TEI is True: o = self.cache.get(_cache_key(self.texts_parsed_cache_key, identifier)) if o is not None: return o else: with open(path) as f: o = Text(urn=identifier, resource=self.xmlparse(f)) self.cache.set(_cache_key(self.texts_parsed_cache_key, identifier), o) else: with open(path) as f: o = Text(urn=identifier, resource=self.xmlparse(f)) return o def parse(self, resource=None): """ Parse a list of directories ans :param resource: List of folders :param ret: Return a specific item ("inventory" or "texts") """ if resource is None: resource = self.__resources__ removing = [] for folder in resource: textgroups = glob("{base_folder}/data/*/__cts__.xml".format(base_folder=folder)) for __cts__ in textgroups: try: with open(__cts__) as __xml__: textgroup = TextGroup.parse( resource=__xml__ ) tg_urn = str(textgroup.urn) if tg_urn in self.dispatcher.collection: self.dispatcher.collection[tg_urn].update(textgroup) else: self.dispatcher.dispatch(textgroup, path=__cts__) for __subcts__ in glob("{parent}/*/__cts__.xml".format(parent=os.path.dirname(__cts__))): with open(__subcts__) as __xml__: work = Work.parse( resource=__xml__, parent=self.dispatcher.collection[tg_urn] ) work_urn = str(work.urn) if work_urn in self.dispatcher.collection[tg_urn].works: self.dispatcher.collection[work_urn].update(work) for __textkey__ in work.texts: __text__ = self.dispatcher.collection[__textkey__] __text__.path = "{directory}/{textgroup}.{work}.{version}.xml".format( directory=os.path.dirname(__subcts__), textgroup=__text__.urn.textgroup, work=__text__.urn.work, version=__text__.urn.version ) if os.path.isfile(__text__.path): try: t = self.read(__textkey__, __text__.path) cites = list() for cite in [c for c in t.citation][::-1]: if len(cites) >= 1: cites.append(Citation( xpath=cite.xpath.replace("'", '"'), scope=cite.scope.replace("'", '"'), name=cite.name, child=cites[-1] )) else: cites.append(Citation( xpath=cite.xpath.replace("'", '"'), scope=cite.scope.replace("'", '"'), name=cite.name )) del t __text__.citation = cites[-1] self.logger.info("%s has been parsed ", __text__.path) if __text__.citation.isEmpty() is True: removing.append(__textkey__) self.logger.error("%s has no passages", __text__.path) except Exception as E: removing.append(__textkey__) self.logger.error( "%s does not accept parsing at some level (most probably citation) ", __text__.path ) else: removing.append(__textkey__) self.logger.error("%s is not present", __text__.path) except MyCapytain.errors.UndispatchedTextError as E: self.logger.error("Error dispatching %s ", __cts__) if self.RAISE_ON_UNDISPATCHED is True: raise UndispatchedTextError(E) except Exception as E: self.logger.error("Error parsing %s ", __cts__) for removable in removing: del self.dispatcher.collection[removable] removing = [] if self.REMOVE_EMPTY is True: # Find resource with no readable descendants for item in self.dispatcher.collection.descendants: if item.readable != True and len(item.readableDescendants) == 0: removing.append(item.id) # Remove them only if they have not been removed before for removable in removing: if removable in self.dispatcher.collection: del self.dispatcher.collection[removable] self.inventory = self.dispatcher.collection return self.inventory def __getText__(self, urn): """ Returns a PrototypeText object :param urn: URN of a text to retrieve :type urn: str, URN :return: Textual resource and metadata :rtype: (Text, InventoryText) """ if not isinstance(urn, URN): urn = URN(urn) if len(urn) != 5: if len(urn) == 4: urn, reference = urn.upTo(URN.WORK), str(urn.reference) urn = [ t.id for t in self.texts if t.id.startswith(str(urn)) and isinstance(t, Edition) ] if len(urn) > 0: urn = URN(urn[0]) else: raise UnknownCollection else: raise InvalidURN try: text = self.inventory[str(urn)] except MyCapytain.errors.UnknownCollection as E: raise UnknownCollection(str(E)) except Exception as E: raise E if os.path.isfile(text.path): resource = self.read(identifier=urn, path=text.path) else: resource = None raise UnknownCollection("File matching %s does not exist" % text.path) return resource, text def getMetadata(self, objectId=None, **filters): """ Request metadata about a text or a collection :param objectId: Object Identifier to filter on :type objectId: str :param filters: Kwargs parameters. :type filters: dict :return: Collection """ return self.get_or( _cache_key("Nautilus", self.name, "GetMetadata", objectId), super(NautilusCTSResolver, self).getMetadata, objectId ) def getReffs(self, textId, level=1, subreference=None): """ Retrieve the siblings of a textual node :param textId: PrototypeText Identifier :type textId: str :param level: Depth for retrieval :type level: int :param subreference: Passage Reference :type subreference: str :return: List of references :rtype: [str] """ return self.get_or( self.__cache_key_reffs__(textId, level, subreference), super(NautilusCTSResolver, self).getReffs, textId, level, subreference ) def __cache_key_reffs__(self, textId, level, subreference): return _cache_key("Nautilus", self.name, "getReffs", textId, level, subreference) def getTextualNode(self, textId, subreference=None, prevnext=False, metadata=False): """ Retrieve a text node from the API :param textId: PrototypeText Identifier :type textId: str :param subreference: Passage Reference :type subreference: str :param prevnext: Retrieve graph representing previous and next passage :type prevnext: boolean :param metadata: Retrieve metadata about the passage and the text :type metadata: boolean :return: Passage :rtype: Passage """ key = _cache_key("Nautilus", self.name, "Passage", textId, subreference) o = self.cache.get(key) if o is not None: return o text, text_metadata = self.__getText__(textId) if subreference is not None: subreference = Reference(subreference) passage = text.getTextualNode(subreference) passage.set_metadata_from_collection(text_metadata) self.cache.set(key, passage) return passage def getSiblings(self, textId, subreference): """ Retrieve the siblings of a textual node :param textId: PrototypeText Identifier :type textId: str :param subreference: Passage Reference :type subreference: str :return: Tuple of references :rtype: (str, str) """ key = _cache_key("Nautilus", self.name, "Siblings", textId, subreference) o = self.cache.get(key) if o is not None: return o passage = self.getTextualNode(textId, subreference, prevnext=True) siblings = passage.siblingsId self.cache.set(key, siblings) return siblings