Compare commits


2 Commits

Author          SHA1         Message                          Date
Stefan Wojcik   894678da39   minor comments                   2016-12-05 00:03:19 -05:00
Stefan Wojcik   0a66a4b8a9   Fix iteration within iteration   2016-12-04 23:55:25 -05:00
3 changed files with 92 additions and 57 deletions


@@ -82,7 +82,6 @@ class BaseQuerySet(object):
         self._limit = None
         self._skip = None
         self._hint = -1  # Using -1 as None is a valid value for hint
-        self._batch_size = None
         self.only_fields = []
         self._max_time_ms = None
@@ -276,6 +275,8 @@ class BaseQuerySet(object):
         except StopIteration:
             return result
 
+        # If we were able to retrieve the 2nd doc, rewind the cursor and
+        # raise the MultipleObjectsReturned exception.
         queryset.rewind()
         message = u'%d items returned, instead of 1' % queryset.count()
         raise queryset._document.MultipleObjectsReturned(message)
@@ -782,19 +783,6 @@ class BaseQuerySet(object):
         queryset._hint = index
         return queryset
 
-    def batch_size(self, size):
-        """Limit the number of documents returned in a single batch (each
-        batch requires a round trip to the server).
-
-        See http://api.mongodb.com/python/current/api/pymongo/cursor.html#pymongo.cursor.Cursor.batch_size
-        for details.
-
-        :param size: desired size of each batch.
-        """
-        queryset = self.clone()
-        queryset._batch_size = size
-        return queryset
-
     def distinct(self, field):
         """Return a list of distinct values for a given field.
@@ -1481,9 +1469,6 @@ class BaseQuerySet(object):
             if self._hint != -1:
                 self._cursor_obj.hint(self._hint)
 
-            if self._batch_size is not None:
-                self._cursor_obj.batch_size(self._batch_size)
-
         return self._cursor_obj
 
     def __deepcopy__(self, memo):

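The batch_size method removed above follows the chained-queryset pattern used throughout BaseQuerySet: each call clones the queryset and stores the setting, so it composes with limit and skip, as the deleted test_batch_size below also exercises. A minimal usage sketch, assuming a local mongod instance and a hypothetical Page document:

from mongoengine import Document, StringField, connect

class Page(Document):
    title = StringField()

connect('batch_size_demo')  # hypothetical database name

# batch_size(10) clones the queryset and records the size; the underlying
# PyMongo cursor then fetches documents from the server 10 at a time.
for page in Page.objects.batch_size(10).limit(100):
    print(page.title)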

@@ -27,9 +27,10 @@ class QuerySet(BaseQuerySet):
         in batches of ``ITER_CHUNK_SIZE``.
 
         If ``self._has_more`` the cursor hasn't been exhausted so cache then
-        batch. Otherwise iterate the result_cache.
+        batch. Otherwise iterate the result_cache.
         """
         self._iter = True
+
         if self._has_more:
             return self._iter_results()
@@ -42,10 +43,12 @@ class QuerySet(BaseQuerySet):
         """
         if self._len is not None:
             return self._len
+
+        # Populate the result cache with *all* of the docs in the cursor
         if self._has_more:
-            # populate the cache
             list(self._iter_results())
 
+        # Cache the length of the complete result cache and return it
         self._len = len(self._result_cache)
         return self._len
@@ -64,18 +67,33 @@ class QuerySet(BaseQuerySet):
     def _iter_results(self):
         """A generator for iterating over the result cache.
 
-        Also populates the cache if there are more possible results to yield.
-        Raises StopIteration when there are no more results"""
+        Also populates the cache if there are more possible results to
+        yield. Raises StopIteration when there are no more results.
+        """
         if self._result_cache is None:
             self._result_cache = []
 
         pos = 0
         while True:
-            upper = len(self._result_cache)
-            while pos < upper:
+
+            # For all positions lower than the length of the current result
+            # cache, serve the docs straight from the cache w/o hitting the
+            # database.
+            # XXX it's VERY important to compute the len within the `while`
+            # condition because the result cache might expand mid-iteration
+            # (e.g. if we call len(qs) inside a loop that iterates over the
+            # queryset). Fortunately len(list) is O(1) in Python, so this
+            # doesn't cause performance issues.
+            while pos < len(self._result_cache):
                 yield self._result_cache[pos]
                 pos += 1
+
+            # Raise StopIteration if we already established there were no more
+            # docs in the db cursor.
             if not self._has_more:
                 raise StopIteration
+
+            # Otherwise, populate more of the cache and repeat.
             if len(self._result_cache) <= pos:
                 self._populate_cache()
@@ -86,12 +104,22 @@ class QuerySet(BaseQuerySet):
         """
         if self._result_cache is None:
             self._result_cache = []
-        if self._has_more:
-            try:
-                for i in xrange(ITER_CHUNK_SIZE):
-                    self._result_cache.append(self.next())
-            except StopIteration:
-                self._has_more = False
+
+        # Skip populating the cache if we already established there are no
+        # more docs to pull from the database.
+        if not self._has_more:
+            return
+
+        # Pull in ITER_CHUNK_SIZE docs from the database and store them in
+        # the result cache.
+        try:
+            for i in xrange(ITER_CHUNK_SIZE):
+                self._result_cache.append(self.next())
+        except StopIteration:
+            # Getting this exception means there are no more docs in the
+            # db cursor. Set _has_more to False so that we can use that
+            # information in other places.
+            self._has_more = False
 
     def count(self, with_limit_and_skip=False):
         """Count the selected elements in the query.

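The caching scheme in the hunks above is easier to follow outside the ODM. Below is a minimal self-contained sketch of the same pattern with the MongoDB cursor replaced by a plain Python iterator; CachedIterable and its attribute names are illustrative stand-ins, not MongoEngine's API:

ITER_CHUNK_SIZE = 100  # same batching constant the QuerySet uses

class CachedIterable(object):
    """Illustrative stand-in for QuerySet's result-cache machinery."""

    def __init__(self, source):
        self._source = iter(source)  # stand-in for the db cursor
        self._result_cache = []
        self._has_more = True

    def __iter__(self):
        pos = 0
        while True:
            # Re-evaluate len() on every pass: the cache can grow while this
            # generator is suspended (e.g. another iterator or __len__ ran).
            while pos < len(self._result_cache):
                yield self._result_cache[pos]
                pos += 1
            if not self._has_more:
                return  # the underlying cursor is exhausted
            self._populate_cache()

    def __len__(self):
        # Drain the source into the cache, then report the cached length.
        while self._has_more:
            self._populate_cache()
        return len(self._result_cache)

    def _populate_cache(self):
        try:
            for _ in range(ITER_CHUNK_SIZE):
                self._result_cache.append(next(self._source))
        except StopIteration:
            self._has_more = False

# Iteration within iteration, as in the new test below, sees every element
# in both loops because each generator keeps its own position in the cache:
data = CachedIterable(range(250))
for i, x in enumerate(data):
    for j, y in enumerate(data):
        pass
assert i == 249 and j == 249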

@@ -337,34 +337,6 @@ class QuerySetTest(unittest.TestCase):
         query = query.filter(boolfield=True)
         self.assertEqual(query.count(), 1)
 
-    def test_batch_size(self):
-        """Ensure that batch_size works."""
-        class A(Document):
-            s = StringField()
-
-        A.drop_collection()
-
-        for i in range(100):
-            A.objects.create(s=str(i))
-
-        # test iterating over the result set
-        cnt = 0
-        for a in A.objects.batch_size(10):
-            cnt += 1
-        self.assertEqual(cnt, 100)
-
-        # test chaining
-        qs = A.objects.all()
-        qs = qs.limit(10).batch_size(20).skip(91)
-        cnt = 0
-        for a in qs:
-            cnt += 1
-        self.assertEqual(cnt, 9)
-
-        # test invalid batch size
-        qs = A.objects.batch_size(-1)
-        self.assertRaises(ValueError, lambda: list(qs))
-
     def test_update_write_concern(self):
         """Test that passing write_concern works"""
         self.Person.drop_collection()
@@ -4918,6 +4890,56 @@ class QuerySetTest(unittest.TestCase):
         self.assertEqual(1, Doc.objects(item__type__="axe").count())
 
+    def test_len_during_iteration(self):
+        """Tests that calling len on a queryset during iteration doesn't
+        stop paging.
+        """
+        class Data(Document):
+            pass
+
+        for i in xrange(300):
+            Data().save()
+
+        records = Data.objects.limit(250)
+
+        # This should pull all 250 docs from mongo and populate the result
+        # cache
+        len(records)
+
+        # Assert that iterating over documents in the qs touches every
+        # document even if we call len(qs) midway through the iteration.
+        for i, r in enumerate(records):
+            if i == 58:
+                len(records)
+        self.assertEqual(i, 249)
+
+        # Assert the same behavior is true even if we didn't pre-populate the
+        # result cache.
+        records = Data.objects.limit(250)
+        for i, r in enumerate(records):
+            if i == 58:
+                len(records)
+        self.assertEqual(i, 249)
+
+    def test_iteration_within_iteration(self):
+        """You should be able to reliably iterate over all the documents
+        in a given queryset even if there are multiple iterations of it
+        happening at the same time.
+        """
+        class Data(Document):
+            pass
+
+        for i in xrange(300):
+            Data().save()
+
+        qs = Data.objects.limit(250)
+        for i, doc in enumerate(qs):
+            for j, doc2 in enumerate(qs):
+                pass
+
+        self.assertEqual(i, 249)
+        self.assertEqual(j, 249)
+
 
 if __name__ == '__main__':
     unittest.main()
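test_len_during_iteration above pins down the failure mode the _iter_results rewrite addresses: snapshotting the cache length before the serving loop goes stale as soon as the cache grows mid-iteration. A tiny illustration of that stale-snapshot bug, simplified from the removed `upper = len(...)` pattern, using a plain list and generator with no MongoEngine involved:

def iter_with_snapshot(cache):
    # The buggy pattern: `upper` is computed once, when the generator first
    # runs, so elements appended while it is suspended are never served.
    pos, upper = 0, len(cache)
    while pos < upper:
        yield cache[pos]
        pos += 1

cache = [0, 1, 2]
gen = iter_with_snapshot(cache)
print(next(gen))       # 0 -- suspends the generator mid-loop
cache.extend([3, 4])   # the cache grows (e.g. len(qs) ran mid-iteration)
print(list(gen))       # [1, 2] -- elements 3 and 4 are silently skipped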