Source code for libreantdb.api

from elasticsearch import NotFoundError
from elasticsearch.helpers import scan

import logging
log = logging.getLogger(__name__)


def validate_book(body):
    '''
    This does not only accept/refuse a book. It also returns an ENHANCED
    version of body, with (mostly fts-related) additional fields.

    This function is idempotent.

    :param body: dict describing the book; it must contain a `_language`
                 key. The dict is modified in place.
    :returns: the very same `body` object, where every key starting with
              `_text` has been dropped and a fresh `_text_<language>` key
              holds all the words collected by `collectStrings`,
              joined by single spaces
    :raises ValueError: if `_language` is missing or is longer than two
                        characters
    '''
    if '_language' not in body:
        raise ValueError('language needed')
    if len(body['_language']) > 2:
        raise ValueError('invalid language: %s' % body['_language'])

    # remove old _text_* fields
    # The keys to drop are collected *before* deleting anything: removing
    # entries from a dict while iterating over it raises RuntimeError on
    # Python 3 (on Python 2 `keys()` happened to return a copy).
    stale_keys = [k for k in body if k.startswith('_text')]
    for k in stale_keys:
        del body[k]

    allfields = collectStrings(body)
    body['_text_%s' % body['_language']] = ' '.join(allfields)
    return body
def collectStrings(leftovers):
    '''
    Recursively collect every whitespace-separated word found in
    `leftovers`.

    - a string contributes its words (`str.split()`)
    - a list contributes the words of each of its items, in order
    - a dict contributes the words of its values, skipping every entry
      whose key starts with `_`
    - anything else (numbers, None, ...) contributes nothing

    :param leftovers: the value to inspect
    :returns: a list of words
    '''
    # `basestring` only exists on Python 2; fall back to `str` so that the
    # function works on Python 3 too instead of raising NameError.
    try:
        text_types = basestring  # noqa: F821
    except NameError:
        text_types = str

    strings = []

    def walk(node):
        # depth-first visit that appends to the shared `strings` list, so
        # words come out in the same order as the nested structure
        if isinstance(node, text_types):
            strings.extend(node.split())
        elif isinstance(node, list):
            for item in node:
                walk(item)
        elif isinstance(node, dict):
            for key, value in node.items():
                if not key.startswith('_'):
                    walk(value)

    walk(leftovers)
    return strings
class DB(object):
    '''
    This class contains every query method and every operation on the index

    The following elasticsearch body response example provides the typical
    structure of a single document.

    .. code-block:: ruby

        {
          "_index" : "libreant",
          "_type" : "book",
          "_id" : "AU4RleAfD1zQdqx6OQ8Y",
          "_version" : 1,
          "found" : true,
          "_source": {"_language": "en",
                      "_text_en": "marco belletti pdf file latex manual",
                      "author": "marco belletti",
                      "type": "pdf file",
                      "title": "latex manual",
                      "_attachments": [{"sha1": "dc8dc34b3e0fec2377e5cf9ea7e4780d87ff18c5",
                                        "name": "LaTeX_Wikibook.pdf",
                                        "url": "fsdb:///dc8dc34b3e0fec2377e5cf9ea7e4780d87ff18c5",
                                        "notes": "A n example bookLatex wikibook",
                                        "mime": "application/pdf",
                                        "download_count": 7,
                                        "id": "17fd3d898a834e2689340cc8aacdebb4",
                                        "size": 23909451}]
                     }
        }
    '''

    # Setup {{{2
    def __init__(self, es, index_name):
        '''
        :param es: the elasticsearch client every request is sent through
        :param index_name: name of the index every request is run against
        '''
        self.es = es
        self.index_name = index_name
        # book_validator can adjust the book, and raise if it's not valid
        self.book_validator = validate_book

    def setup_db(self, wait_for_ready=True):
        '''
        Create and configure index

        The index is created only if it does not exist yet.

        If `wait_for_ready` is True, this function will block until
        status for `self.index_name` will be `yellow`
        '''
        maps = {
            'book': {  # this need to be the document type!
                # special elasticsearch field
                # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/mapping-timestamp-field.html
                # initialized with element creation date, hidden by default in query result
                "_timestamp": {"enabled": "true",
                               "store": "yes"},
                "properties": {
                    "_text_en": {
                        "type": "string",
                        "analyzer": "english"},
                    "_text_it": {
                        "type": "string",
                        "analyzer": "it_analyzer"}
                }
            }
        }

        # Just like the default one
        # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#italian-analyzer
        # but the stemmer changed from light_italian to italian
        settings = {"analysis": {
            "filter": {
                "italian_elision": {
                    "type": "elision",
                    "articles": [
                        "c", "l", "all", "dall", "dell",
                        "nell", "sull", "coll", "pell",
                        "gl", "agl", "dagl", "degl", "negl",
                        "sugl", "un", "m", "t", "s", "v", "d"
                    ]
                },
                "italian_stop": {
                    "type": "stop",
                    "stopwords": "_italian_"},
                "italian_stemmer": {
                    "type": "stemmer",
                    "language": "italian"}
            },
            "analyzer": {
                "it_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "italian_elision",
                        "lowercase",
                        "italian_stop",
                        "italian_stemmer"
                    ]
                }
            }
        }}

        if not self.es.indices.exists(self.index_name):
            self.es.indices.create(index=self.index_name,
                                   body={'settings': settings,
                                         'mappings': maps})

        if wait_for_ready:
            log.debug('waiting for index "%s" to be ready', self.index_name)
            self.es.cluster.health(index=self.index_name,
                                   level='index',
                                   wait_for_status='yellow')
            log.debug('index "%s" is now ready', self.index_name)
    # End setup }}}

    # Queries {{{2
    def __len__(self):
        '''Return the `count` reported by elasticsearch for the index'''
        return self.es.count(index=self.index_name)['count']

    def _search(self, body, **kargs):
        '''
        Run a search on `self.index_name`

        :param body: the search request body
        :param kargs: passed as-is to the client `search` call
        :returns: whatever the client `search` call returns
        '''
        return self.es.search(index=self.index_name, body=body, **kargs)

    def _get_search_field(self, field, value):
        '''Build the body of a `match` query of `value` against `field`'''
        return {'query': {'match': {field: value}}}

    def mlt(self, _id):
        '''
        High-level method to do "more like this".

        Its exact implementation can vary.
        '''
        query = {'more_like_this': {
            # FIXME: text_* does not seem to work, so we're relying on listing
            # them manually
            'fields': ['book._text_it', 'book._text_en'],
            'ids': [_id],
            'min_term_freq': 1,
            'min_doc_freq': 1,
        }}
        return self._search(dict(query=query))

    def get_all_books(self, size=30):
        '''Search with an empty body, asking for at most `size` results'''
        return self._search({}, size=size)

    def iterate_all(self):
        '''Return the `scan` helper iterator over the whole index'''
        return scan(self.es, index=self.index_name)

    def get_last_inserted(self, size=30):
        '''
        Search every document sorted by descending `_timestamp`, asking
        for at most `size` results
        '''
        query = {"fields": ["_timestamp", "_source"],
                 "query": {"match_all": {}},
                 "sort": [{"_timestamp": "desc"}]}
        return self._search(body=query, size=size)

    def get_books_simplequery(self, query):
        '''Search `query` with a `match` on the `_all` field'''
        return self._search(self._get_search_field('_all', query))

    def get_books_multilanguage(self, query):
        '''Search `query` with a `multi_match` on the `_text_*` fields'''
        return self._search({'query': {'multi_match': {
            'query': query,
            'fields': '_text_*'}}})

    def get_books_by_title(self, title):
        '''Search `title` with a `match` on the `title` field'''
        return self._search(self._get_search_field('title', title))

    def get_books_by_actor(self, authorname):
        '''Search `authorname` with a `match` on the `actors` field'''
        return self._search(self._get_search_field('actors', authorname))

    def get_book_by_id(self, id):
        '''Return the client `get` response for the document `id`'''
        return self.es.get(index=self.index_name, id=id)

    def get_books_querystring(self, query, **kargs):
        '''
        Search `query` as a `query_string` query on the `_text_*` fields

        :param kargs: passed as-is to the client `search` call
        '''
        q = {'query': query, 'fields': ['_text_*']}
        return self._search({'query': dict(query_string=q)}, **kargs)

    def file_is_attached(self, url):
        '''
        return true if at least one book has
        file with the given url as attachment
        '''
        body = self._get_search_field('_attachments.url', url)
        return self.es.count(index=self.index_name, body=body)['count'] > 0

    def autocomplete(self, fieldname, start):
        '''Not implemented: always raises `NotImplementedError`'''
        raise NotImplementedError()
    # End queries }}}

    # Operations {{{2
    def add_book(self, **book):
        '''
        Call it like this:
            db.add_book(doc_type='book',
            body={'title': 'foobar', '_language': 'it'})

        `doc_type` defaults to 'book'. The body goes through
        `self.book_validator` before being sent, so this raises whatever
        the validator raises for a book it refuses.

        :returns: whatever the client `create` call returns
        '''
        book.setdefault('doc_type', 'book')
        book['body'] = self.book_validator(book['body'])
        return self.es.create(index=self.index_name, **book)

    def delete_book(self, id, doc_type='book'):
        '''Delete the document `id` of type `doc_type` from the index'''
        self.es.delete(index=self.index_name,
                       id=id,
                       doc_type=doc_type)

    def update_book(self, id, body, doc_type='book'):
        '''
        Update a book

        The "body" is merged with the current one.
        Yes, it is NOT overwritten.

        In case of concurrency conflict
        this function could raise `elasticsearch.ConflictError`
        '''
        # note that we are NOT overwriting all the _source, just merging
        book = self.get_book_by_id(id)
        book['_source'].update(body)
        validated = self.book_validator(book['_source'])
        # the version read above is sent back along with the new body
        ret = self.es.index(index=self.index_name, id=id,
                            doc_type=doc_type, body=validated,
                            version=book['_version'])
        return ret

    def modify_book(self, id, body, doc_type='book', version=None):
        '''
        replace the entire book body

        Instead of `update_book` this function
        will overwrite the book content with param body

        If param `version` is given, it will be checked that the
        changes are applied upon that document version.
        If the document version provided is different from the one actually
        found, an `elasticsearch.ConflictError` will be raised
        '''
        validatedBody = self.book_validator(body)
        params = dict(index=self.index_name, id=id,
                      doc_type=doc_type, body=validatedBody)
        # a falsy version (None, 0, '') means "no version check"
        if version:
            params['version'] = version
        ret = self.es.index(**params)
        return ret

    def increment_download_count(self, id, attachmentID, doc_type='book'):
        '''
        Increment the download counter of a specific file

        :param id: id of the book owning the attachment
        :param attachmentID: value of the `id` field of the attachment
        :param doc_type: document type used both to read and to update
                         the book
        :raises NotFoundError: if the book has no attachment with the
                               given id
        '''
        body = self.es.get(index=self.index_name,
                           id=id,
                           doc_type=doc_type,
                           _source_include='_attachments')['_source']

        # a book without attachments simply has no match
        attachments = body.get('_attachments', [])
        for attachment in attachments:
            if attachment['id'] == attachmentID:
                # an attachment lacking the counter is counted from zero
                attachment['download_count'] = \
                    attachment.get('download_count', 0) + 1
                # the whole attachment list is sent back in the partial update
                self.es.update(index=self.index_name,
                               id=id,
                               doc_type=doc_type,
                               body={"doc": {'_attachments': attachments}})
                return
        # NOTE(review): the message is the only argument given to
        # NotFoundError here -- confirm callers rely on the exception type
        # only and not on its status code / info attributes
        raise NotFoundError("No attachment could be found with id: {}".format(attachmentID))
    # End operations }}}

# vim: set fdm=marker fdl=1: