Added support for text search and text_score.

2014-07-07 20:24:37 -03:00
parent c6e846e0ae
commit f7ebf8dedd
5 changed files with 449 additions and 197 deletions
--- a/docs/guide/index.rst
+++ b/docs/guide/index.rst
@@ -12,3 +12,4 @@ User Guide
   querying
   gridfs
   signals
+   text-indexes
--- a/docs/guide/text-indexes.rst
+++ b/docs/guide/text-indexes.rst
@@ -0,0 +1,47 @@
+===========
+Text Search
+===========
+
+Since version 2.4, MongoDB supports searching documents by text indexes.
+
+
+Defining a Document with text index
+===================================
+Use the *$* prefix on a field name to declare a text index, as in this declaration::
+  
+  class News(Document):
+      title = StringField()
+      content = StringField()
+      is_active = BooleanField()
+
+      meta = {'indexes': [
+          {'fields': ['$title', "$content"],
+           'default_language': 'english',
+           'weights': {'title': 10, 'content': 2}
+          }
+      ]}
+
+
+
+Querying
+========
+
+First, save a couple of documents::
+
+  News(title="Using mongodb text search",
+       content="Testing text search").save()
+
+  News(title="MongoEngine 0.9 released",
+       content="Various improvements").save()
+
+Next, start a text search using the :attr:`QuerySet.search_text` method::
+  
+  document = News.objects.search_text('testing').first()
+  document.title # may be: "Using mongodb text search"
+  
+  document = News.objects.search_text('released').first()
+  document.title # may be: "MongoEngine 0.9 released"
+
+
+
+
--- a/mongoengine/document.py
+++ b/mongoengine/document.py
@@ -41,6 +41,7 @@ class InvalidCollectionError(Exception):


 class EmbeddedDocument(BaseDocument):
+
    """A :class:`~mongoengine.Document` that isn't stored in its own
    collection.  :class:`~mongoengine.EmbeddedDocument`\ s should be used as
    fields on :class:`~mongoengine.Document`\ s through the
@@ -59,7 +60,7 @@ class EmbeddedDocument(BaseDocument):

    # The __metaclass__ attribute is removed by 2to3 when running with Python3
    # my_metaclass is defined so that metaclass can be queried in Python 2 & 3
-    my_metaclass  = DocumentMetaclass
+    my_metaclass = DocumentMetaclass
    __metaclass__ = DocumentMetaclass

    def __init__(self, *args, **kwargs):
@@ -77,6 +78,7 @@ class EmbeddedDocument(BaseDocument):


 class Document(BaseDocument):
+
    """The base class used for defining the structure and properties of
    collections of documents stored in MongoDB. Inherit from this class, and
    add fields as class attributes to define a document's structure.
@@ -124,14 +126,15 @@ class Document(BaseDocument):

    # The __metaclass__ attribute is removed by 2to3 when running with Python3
    # my_metaclass is defined so that metaclass can be queried in Python 2 & 3
-    my_metaclass  = TopLevelDocumentMetaclass
+    my_metaclass = TopLevelDocumentMetaclass
    __metaclass__ = TopLevelDocumentMetaclass

-    __slots__ = ('__objects' )
+    __slots__ = ('__objects')

    def pk():
        """Primary key alias
        """
+
        def fget(self):
            return getattr(self, self._meta['id_field'])

@@ -140,6 +143,13 @@ class Document(BaseDocument):
        return property(fget, fset)
    pk = pk()

+    @property
+    def text_score(self):
+        """
+        Used for text searches
+        """
+        return self._data.get('text_score')
+
    @classmethod
    def _get_db(cls):
        """Some Model using other db_alias"""
@@ -165,7 +175,7 @@ class Document(BaseDocument):
                    if options.get('max') != max_documents or \
                       options.get('size') != max_size:
                        msg = (('Cannot create collection "%s" as a capped '
-                               'collection as it already exists')
+                                'collection as it already exists')
                               % cls._collection)
                        raise InvalidCollectionError(msg)
                else:
@@ -282,9 +292,9 @@ class Document(BaseDocument):
                                                   upsert=upsert, **write_concern)
                    created = is_new_object(last_error)

-
            if cascade is None:
-                cascade = self._meta.get('cascade', False) or cascade_kwargs is not None
+                cascade = self._meta.get(
+                    'cascade', False) or cascade_kwargs is not None

            if cascade:
                kwargs = {
@@ -377,7 +387,8 @@ class Document(BaseDocument):
                    del(query["_cls"])
                return self._qs.filter(**query).update_one(**kwargs)
            else:
-                raise OperationError('attempt to update a document not yet saved')
+                raise OperationError(
+                    'attempt to update a document not yet saved')

        # Need to add shard key to query, or you get an error
        return self._qs.filter(**self._object_key).update_one(**kwargs)
@@ -396,7 +407,8 @@ class Document(BaseDocument):
        signals.pre_delete.send(self.__class__, document=self)

        try:
-            self._qs.filter(**self._object_key).delete(write_concern=write_concern, _from_doc_delete=True)
+            self._qs.filter(
+                **self._object_key).delete(write_concern=write_concern, _from_doc_delete=True)
        except pymongo.errors.OperationFailure, err:
            message = u'Could not delete document (%s)' % err.message
            raise OperationError(message)
@@ -483,8 +495,8 @@ class Document(BaseDocument):
        if not self.pk:
            raise self.DoesNotExist("Document does not exist")
        obj = self._qs.read_preference(ReadPreference.PRIMARY).filter(
-                    **self._object_key).only(*fields).limit(1
-                    ).select_related(max_depth=max_depth)
+            **self._object_key).only(*fields).limit(1
+                                                    ).select_related(max_depth=max_depth)

        if obj:
            obj = obj[0]
@@ -528,8 +540,8 @@ class Document(BaseDocument):
        object.
        """
        classes = [get_document(class_name)
-                    for class_name in cls._subclasses
-                    if class_name != cls.__name__] + [cls]
+                   for class_name in cls._subclasses
+                   if class_name != cls.__name__] + [cls]
        documents = [get_document(class_name)
                     for class_name in document_cls._subclasses
                     if class_name != document_cls.__name__] + [document_cls]
@@ -551,7 +563,7 @@ class Document(BaseDocument):

    @classmethod
    def ensure_index(cls, key_or_list, drop_dups=False, background=False,
-        **kwargs):
+                     **kwargs):
        """Ensure that the given indexes are in place.

        :param key_or_list: a single index key or a list of index keys (to
@@ -606,7 +618,7 @@ class Document(BaseDocument):
        # If _cls is being used (for polymorphism), it needs an index,
        # only if another index doesn't begin with _cls
        if (index_cls and not cls_indexed and
-           cls._meta.get('allow_inheritance', ALLOW_INHERITANCE) is True):
+                cls._meta.get('allow_inheritance', ALLOW_INHERITANCE) is True):
            collection.ensure_index('_cls', background=background,
                                    **index_opts)

@@ -621,24 +633,25 @@ class Document(BaseDocument):

        # get all the base classes, subclasses and sieblings
        classes = []
+
        def get_classes(cls):

            if (cls not in classes and
-               isinstance(cls, TopLevelDocumentMetaclass)):
+                    isinstance(cls, TopLevelDocumentMetaclass)):
                classes.append(cls)

            for base_cls in cls.__bases__:
                if (isinstance(base_cls, TopLevelDocumentMetaclass) and
-                   base_cls != Document and
-                   not base_cls._meta.get('abstract') and
-                   base_cls._get_collection().full_name == cls._get_collection().full_name and
-                   base_cls not in classes):
+                        base_cls != Document and
+                        not base_cls._meta.get('abstract') and
+                        base_cls._get_collection().full_name == cls._get_collection().full_name and
+                        base_cls not in classes):
                    classes.append(base_cls)
                    get_classes(base_cls)
            for subclass in cls.__subclasses__():
                if (isinstance(base_cls, TopLevelDocumentMetaclass) and
-                   subclass._get_collection().full_name == cls._get_collection().full_name and
-                   subclass not in classes):
+                        subclass._get_collection().full_name == cls._get_collection().full_name and
+                        subclass not in classes):
                    classes.append(subclass)
                    get_classes(subclass)

@@ -666,8 +679,8 @@ class Document(BaseDocument):
        if [(u'_id', 1)] not in indexes:
            indexes.append([(u'_id', 1)])
        if (cls._meta.get('index_cls', True) and
-           cls._meta.get('allow_inheritance', ALLOW_INHERITANCE) is True):
-             indexes.append([(u'_cls', 1)])
+                cls._meta.get('allow_inheritance', ALLOW_INHERITANCE) is True):
+            indexes.append([(u'_cls', 1)])

        return indexes

@@ -678,7 +691,8 @@ class Document(BaseDocument):
        """

        required = cls.list_indexes()
-        existing = [info['key'] for info in cls._get_collection().index_information().values()]
+        existing = [info['key']
+                    for info in cls._get_collection().index_information().values()]
        missing = [index for index in required if index not in existing]
        extra = [index for index in existing if index not in required]

@@ -696,6 +710,7 @@ class Document(BaseDocument):


 class DynamicDocument(Document):
+
    """A Dynamic Document class allowing flexible, expandable and uncontrolled
    schemas.  As a :class:`~mongoengine.Document` subclass, acts in the same
    way as an ordinary document but has expando style properties.  Any data
@@ -711,7 +726,7 @@ class DynamicDocument(Document):

    # The __metaclass__ attribute is removed by 2to3 when running with Python3
    # my_metaclass is defined so that metaclass can be queried in Python 2 & 3
-    my_metaclass  = TopLevelDocumentMetaclass
+    my_metaclass = TopLevelDocumentMetaclass
    __metaclass__ = TopLevelDocumentMetaclass

    _dynamic = True
@@ -727,6 +742,7 @@ class DynamicDocument(Document):


 class DynamicEmbeddedDocument(EmbeddedDocument):
+
    """A Dynamic Embedded Document class allowing flexible, expandable and
    uncontrolled schemas. See :class:`~mongoengine.DynamicDocument` for more
    information about dynamic documents.
@@ -734,7 +750,7 @@ class DynamicEmbeddedDocument(EmbeddedDocument):

    # The __metaclass__ attribute is removed by 2to3 when running with Python3
    # my_metaclass is defined so that metaclass can be queried in Python 2 & 3
-    my_metaclass  = DocumentMetaclass
+    my_metaclass = DocumentMetaclass
    __metaclass__ = DocumentMetaclass

    _dynamic = True
@@ -753,6 +769,7 @@ class DynamicEmbeddedDocument(EmbeddedDocument):


 class MapReduceDocument(object):
+
    """A document returned from a map/reduce query.

    :param collection: An instance of :class:`~pymongo.Collection`
@@ -783,7 +800,7 @@ class MapReduceDocument(object):
            try:
                self.key = id_field_type(self.key)
            except:
-                raise Exception("Could not cast key as %s" % \
+                raise Exception("Could not cast key as %s" %
                                id_field_type.__name__)

        if not hasattr(self, "_key_object"):
--- a/mongoengine/queryset/base.py
+++ b/mongoengine/queryset/base.py
@@ -39,6 +39,7 @@ RE_TYPE = type(re.compile(''))


 class BaseQuerySet(object):
+
    """A set of results returned from a query. Wraps a MongoDB cursor,
    providing :class:`~mongoengine.Document` objects as the results.
    """
@@ -64,6 +65,8 @@ class BaseQuerySet(object):
        self._none = False
        self._as_pymongo = False
        self._as_pymongo_coerce = False
+        self._search_text = None
+        self._include_text_scores = False

        # If inheritance is allowed, only return instances and instances of
        # subclasses of the class being used
@@ -71,7 +74,8 @@ class BaseQuerySet(object):
            if len(self._document._subclasses) == 1:
                self._initial_query = {"_cls": self._document._subclasses[0]}
            else:
-                self._initial_query = {"_cls": {"$in": self._document._subclasses}}
+                self._initial_query = {
+                    "_cls": {"$in": self._document._subclasses}}
            self._loaded_fields = QueryFieldList(always_include=['_cls'])
        self._cursor_obj = None
        self._limit = None
@@ -148,6 +152,7 @@ class BaseQuerySet(object):
                return queryset._get_scalar(
                    queryset._document._from_son(queryset._cursor[key],
                                                 _auto_dereference=self._auto_dereference))
+
            if queryset._as_pymongo:
                return queryset._get_as_pymongo(queryset._cursor[key])
            return queryset._document._from_son(queryset._cursor[key],
@@ -184,6 +189,35 @@ class BaseQuerySet(object):
        """
        return self.__call__(*q_objs, **query)

+    def search_text(self, text, language=None, include_text_scores=False):
+        """
+        Start a text search, using text indexes.
+
+        :param language:  The language that determines the list of stop words
+            for the search and the rules for the stemmer and tokenizer.
+            If not specified, the search uses the default language of the index.
+            For supported languages, see `Text Search Languages <http://docs.mongodb.org/manual/reference/text-search-languages/#text-search-languages>`_.
+
+        :param include_text_scores: If True, automatically add a text_score attribute to Document.
+
+        """
+        queryset = self.clone()
+        if queryset._search_text:
+            raise OperationError(
+                "Is not possible to use search_text two times.")
+
+        query_kwargs = {'$search': text}
+        if language:
+            query_kwargs['$language'] = language
+
+        queryset._query_obj &= Q(__raw__={'$text': query_kwargs})
+        queryset._mongo_query = None
+        queryset._cursor_obj = None
+        queryset._search_text = text
+        queryset._include_text_scores = include_text_scores
+
+        return queryset
+
    def get(self, *q_objs, **query):
        """Retrieve the the matching object raising
        :class:`~mongoengine.queryset.MultipleObjectsReturned` or
@@ -322,10 +356,10 @@ class BaseQuerySet(object):
        try:
            ids = self._collection.insert(raw, **write_concern)
        except pymongo.errors.DuplicateKeyError, err:
-            message = 'Could not save document (%s)';
+            message = 'Could not save document (%s)'
            raise NotUniqueError(message % unicode(err))
        except pymongo.errors.OperationFailure, err:
-            message = 'Could not save document (%s)';
+            message = 'Could not save document (%s)'
            if re.match('^E1100[01] duplicate key', unicode(err)):
                # E11000 - duplicate key error index
                # E11001 - duplicate key on update
@@ -408,7 +442,7 @@ class BaseQuerySet(object):
                ref_q = document_cls.objects(**{field_name + '__in': self})
                ref_q_count = ref_q.count()
                if (doc != document_cls and ref_q_count > 0
-                    or (doc == document_cls and ref_q_count > 0)):
+                        or (doc == document_cls and ref_q_count > 0)):
                    ref_q.delete(write_concern=write_concern)
            elif rule == NULLIFY:
                document_cls.objects(**{field_name + '__in': self}).update(
@@ -418,7 +452,8 @@ class BaseQuerySet(object):
                    write_concern=write_concern,
                    **{'pull_all__%s' % field_name: self})

-        queryset._collection.remove(queryset._query, write_concern=write_concern)
+        queryset._collection.remove(
+            queryset._query, write_concern=write_concern)

    def update(self, upsert=False, multi=True, write_concern=None,
               full_result=False, **update):
@@ -515,7 +550,8 @@ class BaseQuerySet(object):
            raise OperationError("Conflicting parameters: remove and new")

        if not update and not upsert and not remove:
-            raise OperationError("No update parameters, must either update or remove")
+            raise OperationError(
+                "No update parameters, must either update or remove")

        queryset = self.clone()
        query = queryset._query
@@ -622,13 +658,15 @@ class BaseQuerySet(object):
          :class:`~mongoengine.queryset.base.BaseQuerySet` into another child class
        """
        if not isinstance(cls, BaseQuerySet):
-            raise OperationError('%s is not a subclass of BaseQuerySet' % cls.__name__)
+            raise OperationError(
+                '%s is not a subclass of BaseQuerySet' % cls.__name__)

        copy_props = ('_mongo_query', '_initial_query', '_none', '_query_obj',
                      '_where_clause', '_loaded_fields', '_ordering', '_snapshot',
                      '_timeout', '_class_check', '_slave_okay', '_read_preference',
                      '_iter', '_scalar', '_as_pymongo', '_as_pymongo_coerce',
-                      '_limit', '_skip', '_hint', '_auto_dereference')
+                      '_limit', '_skip', '_hint', '_auto_dereference',
+                      '_search_text', '_include_text_scores')

        for prop in copy_props:
            val = getattr(self, prop)
@@ -714,11 +752,14 @@ class BaseQuerySet(object):
            distinct = self._dereference(queryset._cursor.distinct(field), 1,
                                         name=field, instance=self._document)

-            # We may need to cast to the correct type eg. ListField(EmbeddedDocumentField)
-            doc_field = getattr(self._document._fields.get(field), "field", None)
+            # We may need to cast to the correct type eg.
+            # ListField(EmbeddedDocumentField)
+            doc_field = getattr(
+                self._document._fields.get(field), "field", None)
            instance = getattr(doc_field, "document_type", False)
            EmbeddedDocumentField = _import_class('EmbeddedDocumentField')
-            GenericEmbeddedDocumentField = _import_class('GenericEmbeddedDocumentField')
+            GenericEmbeddedDocumentField = _import_class(
+                'GenericEmbeddedDocumentField')
            if instance and isinstance(doc_field, (EmbeddedDocumentField,
                                                   GenericEmbeddedDocumentField)):
                distinct = [instance(**doc) for doc in distinct]
@@ -799,7 +840,8 @@ class BaseQuerySet(object):
        for value, group in itertools.groupby(fields, lambda x: x[1]):
            fields = [field for field, value in group]
            fields = queryset._fields_to_dbfields(fields)
-            queryset._loaded_fields += QueryFieldList(fields, value=value, _only_called=_only_called)
+            queryset._loaded_fields += QueryFieldList(
+                fields, value=value, _only_called=_only_called)

        return queryset

@@ -1036,7 +1078,6 @@ class BaseQuerySet(object):
                    ordered_output.append(('db', get_db(db_alias).name))
                    del remaing_args[0]

-
                for part in remaing_args:
                    value = output.get(part)
                    if value:
@@ -1292,6 +1333,13 @@ class BaseQuerySet(object):
            cursor_args['slave_okay'] = self._slave_okay
        if self._loaded_fields:
            cursor_args['fields'] = self._loaded_fields.as_dict()
+
+        if self._include_text_scores:
+            if 'fields' not in cursor_args:
+                cursor_args['fields'] = {}
+
+            cursor_args['fields']['text_score'] = {'$meta': "textScore"}
+
        return cursor_args

    @property
--- a/tests/queryset/queryset.py
+++ b/tests/queryset/queryset.py