from elasticsearch import NotFoundError
from elasticsearch.helpers import scan
import logging
log = logging.getLogger(__name__)


def validate_book(body):
    '''
    This does not only accept/refuse a book. It also returns an ENHANCED
    version of body, with (mostly fts-related) additional fields.
    This function is idempotent.
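
    A minimal sketch of the enhancement (the input fields are made up for
    illustration; the word order of the generated text depends on how the
    body is traversed)::

        book = validate_book({'_language': 'en',
                              'title': 'latex manual',
                              'author': 'marco belletti'})
        # book now also contains a '_text_en' field such as
        # 'latex manual marco belletti'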
    '''
    if '_language' not in body:
        raise ValueError('language needed')
    if len(body['_language']) > 2:
        raise ValueError('invalid language: %s' % body['_language'])

    # remove old _text_* fields
    # (iterate over a copy of the keys, since entries are deleted while looping)
    for k in list(body.keys()):
        if k.startswith('_text'):
            del body[k]

    allfields = collectStrings(body)
    body['_text_%s' % body['_language']] = ' '.join(allfields)
    return body


def collectStrings(leftovers):
    '''
    Recursively collect every string value found in `leftovers`, skipping
    dict keys that start with an underscore, and return them as a flat
    list of words.
    '''
    strings = []
    if isinstance(leftovers, basestring):
        return leftovers.split()
    elif isinstance(leftovers, list):
        for l in leftovers:
            strings.extend(collectStrings(l))
        return strings
    elif isinstance(leftovers, dict):
        for key, value in leftovers.items():
            if not key.startswith('_'):
                strings.extend(collectStrings(value))
        return strings
    else:
        return strings


class DB(object):
    '''
    This class contains every query method and every operation on the index.

    The following elasticsearch response example shows the typical structure
    of a single document.

    .. code-block:: json

        {
            "_index": "libreant",
            "_type": "book",
            "_id": "AU4RleAfD1zQdqx6OQ8Y",
            "_version": 1,
            "found": true,
            "_source": {
                "_language": "en",
                "_text_en": "marco belletti pdf file latex manual",
                "author": "marco belletti",
                "type": "pdf file",
                "title": "latex manual",
                "_attachments": [{
                    "sha1": "dc8dc34b3e0fec2377e5cf9ea7e4780d87ff18c5",
                    "name": "LaTeX_Wikibook.pdf",
                    "url": "fsdb:///dc8dc34b3e0fec2377e5cf9ea7e4780d87ff18c5",
                    "notes": "An example book: LaTeX wikibook",
                    "mime": "application/pdf",
                    "download_count": 7,
                    "id": "17fd3d898a834e2689340cc8aacdebb4",
                    "size": 23909451
                }]
            }
        }
    '''

    # Setup {{{2
    def __init__(self, es, index_name):
        self.es = es
        self.index_name = index_name
        # book_validator can adjust the book, and raise if it's not valid
        self.book_validator = validate_book

    def setup_db(self, wait_for_ready=True):
        ''' Create and configure the index

        If `wait_for_ready` is True, this function will block until the
        status of `self.index_name` is at least `yellow`.
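
        A bootstrap sketch (the index name here is just an example)::

            from elasticsearch import Elasticsearch

            db = DB(Elasticsearch(), 'libreant')
            db.setup_db()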
        '''
        maps = {
            'book': {  # this needs to be the document type!
                # special elasticsearch field
                # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/mapping-timestamp-field.html
                # initialized with element creation date, hidden by default in query result
                "_timestamp": {"enabled": "true",
                               "store": "yes"},
                "properties": {
                    "_text_en": {
                        "type": "string",
                        "analyzer": "english"},
                    "_text_it": {
                        "type": "string",
                        "analyzer": "it_analyzer"}
                }
            }
        }
        # Just like the default one
        # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#italian-analyzer
        # but the stemmer changed from light_italian to italian
        settings = {"analysis": {
            "filter": {
                "italian_elision": {
                    "type": "elision",
                    "articles": [
                        "c", "l", "all", "dall", "dell",
                        "nell", "sull", "coll", "pell",
                        "gl", "agl", "dagl", "degl", "negl",
                        "sugl", "un", "m", "t", "s", "v", "d"
                    ]
                },
                "italian_stop": {
                    "type": "stop", "stopwords": "_italian_"},
                "italian_stemmer": {
                    "type": "stemmer", "language": "italian"}
            },
            "analyzer": {
                "it_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "italian_elision",
                        "lowercase",
                        "italian_stop",
                        "italian_stemmer"
                    ]
                }
            }
        }}
        if not self.es.indices.exists(self.index_name):
            self.es.indices.create(index=self.index_name,
                                   body={'settings': settings,
                                         'mappings': maps})
        if wait_for_ready:
            log.debug('waiting for index "{}" to be ready'.format(self.index_name))
            self.es.cluster.health(index=self.index_name, level='index', wait_for_status='yellow')
            log.debug('index "{}" is now ready'.format(self.index_name))
    # End setup }}}

    # Queries {{{2
    def __len__(self):
        return self.es.count(index=self.index_name)['count']

    def _search(self, body, **kargs):
        return self.es.search(index=self.index_name, body=body, **kargs)

    def _get_search_field(self, field, value):
        return {'query':
                {'match': {field: value}}
                }

    def mlt(self, _id):
        '''
        High-level method to do "more like this".
        Its exact implementation can vary.
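
        A usage sketch (the id comes from the example document in the
        class docstring)::

            similar = db.mlt('AU4RleAfD1zQdqx6OQ8Y')
            for hit in similar['hits']['hits']:
                print(hit['_source'].get('title'))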
        '''
        query = {'more_like_this': {
            # FIXME: text_* does not seem to work, so we're relying on listing
            # them manually
            'fields': ['book._text_it', 'book._text_en'],
            'ids': [_id],
            'min_term_freq': 1,
            'min_doc_freq': 1,
        }}
        return self._search(dict(query=query))

    def get_all_books(self, size=30):
        return self._search({}, size=size)

    def iterate_all(self):
        return scan(self.es, index=self.index_name)

    def get_last_inserted(self, size=30):
        query = {"fields": ["_timestamp", "_source"],
                 "query": {"match_all": {}},
                 "sort": [{"_timestamp": "desc"}]}
        return self._search(body=query, size=size)

    def get_books_simplequery(self, query):
        return self._search(self._get_search_field('_all', query))

    def get_books_multilanguage(self, query):
        return self._search({'query': {'multi_match':
                                       {'query': query, 'fields': '_text_*'}
                                       }})

    def get_books_by_title(self, title):
        return self._search(self._get_search_field('title', title))

    def get_books_by_actor(self, authorname):
        return self._search(self._get_search_field('actors', authorname))

    def get_book_by_id(self, id):
        return self.es.get(index=self.index_name, id=id)

    def get_books_querystring(self, query, **kargs):
        q = {'query': query, 'fields': ['_text_*']}
        return self._search({'query': dict(query_string=q)}, **kargs)

    def user_search(self, query):
        '''
        This acts as a "wrapper" that always points to the recommended
        function for user searching.
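
        Illustrative call (the query string is made up)::

            results = db.user_search('latex manual')
            for hit in results['hits']['hits']:
                print(hit['_id'], hit['_source'].get('title'))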
        '''
        return self.get_books_querystring(query)

    def file_is_attached(self, url):
        '''Return True if at least one book has a file with the given
        url among its attachments.
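
        A sketch, reusing the attachment url from the example document in
        the class docstring::

            db.file_is_attached('fsdb:///dc8dc34b3e0fec2377e5cf9ea7e4780d87ff18c5')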
        '''
        body = self._get_search_field('_attachments.url', url)
        return self.es.count(index=self.index_name, body=body)['count'] > 0

    def autocomplete(self, fieldname, start):
        raise NotImplementedError()
    # End queries }}}

    # Operations {{{2
    def add_book(self, **book):
        '''
        Call it like this:
            db.add_book(doc_type='book',
                        body={'title': 'foobar', '_language': 'it'})
        '''
        if 'doc_type' not in book:
            book['doc_type'] = 'book'
        book['body'] = validate_book(book['body'])
        return self.es.create(index=self.index_name, **book)

    def delete_book(self, id):
        self.es.delete(index=self.index_name,
                       id=id,
                       doc_type='book')

    def update_book(self, id, body, doc_type='book'):
        ''' Update a book

        The "body" is merged with the current one.
        Yes, it is NOT overwritten.
        In case of a concurrency conflict,
        this function may raise `elasticsearch.ConflictError`.
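
        A sketch of the merge semantics (the id and field are illustrative)::

            db.update_book(some_id, {'title': 'a new title'})
            # only 'title' changes; every other existing field of the
            # stored document is kept as it was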
        '''
        # note that we are NOT overwriting all the _source, just merging
        book = self.get_book_by_id(id)
        book['_source'].update(body)
        validated = validate_book(book['_source'])
        ret = self.es.index(index=self.index_name, id=id,
                            doc_type=doc_type, body=validated, version=book['_version'])
        return ret

    def modify_book(self, id, body, doc_type='book', version=None):
        ''' Replace the entire book body

        Unlike `update_book`, this function
        overwrites the book content with the given `body`.
        If the `version` parameter is given, the changes are applied
        only upon that document version.
        If the document version provided is different from the one actually found,
        an `elasticsearch.ConflictError` will be raised.
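
        A sketch of optimistic-concurrency usage (the id is illustrative)::

            old = db.get_book_by_id(book_id)
            db.modify_book(book_id,
                           {'_language': 'en', 'title': 'replacement'},
                           version=old['_version'])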
        '''
        validatedBody = validate_book(body)
        params = dict(index=self.index_name, id=id, doc_type=doc_type, body=validatedBody)
        if version:
            params['version'] = version
        ret = self.es.index(**params)
        return ret

    def increment_download_count(self, id, attachmentID, doc_type='book'):
        '''
        Increment the download counter of a specific file
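
        Illustrative call (the ids come from the example document in the
        class docstring)::

            db.increment_download_count('AU4RleAfD1zQdqx6OQ8Y',
                                        '17fd3d898a834e2689340cc8aacdebb4')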
        '''
        body = self.es.get(index=self.index_name, id=id, doc_type='book',
                           _source_include='_attachments')['_source']
        for attachment in body['_attachments']:
            if attachment['id'] == attachmentID:
                attachment['download_count'] += 1
                self.es.update(index=self.index_name,
                               id=id,
                               doc_type=doc_type,
                               body={"doc": {'_attachments': body['_attachments']}})
                return
        raise NotFoundError("No attachment could be found with id: {}".format(attachmentID))
# End operations }}}
# vim: set fdm=marker fdl=1: