Switch auto field stats to an item pipeline #216

Open · wants to merge 8 commits into base: main
41 changes: 33 additions & 8 deletions docs/reference/settings.rst
@@ -13,13 +13,11 @@ ZYTE_API_AUTO_FIELD_STATS

Default: ``False``

Enables stats that indicate which requested fields :ref:`obtained through
scrapy-poet integration <scrapy-poet>` come directly from
:ref:`zapi-extract`.
Enables stats that indicate which fields from yielded items come from
:ref:`zapi-extract` when using :ref:`scrapy-poet integration <scrapy-poet>`.

If for any request no page object class is used to override
:ref:`zapi-extract` fields for a given item type, the following stat is
set:
If for any combination of item type and URL there is no registered page object
class, the following stat is set:

.. code-block:: python

@@ -28,8 +26,9 @@ set:
.. note:: A literal ``(all fields)`` string is used as value, not a list with
all fields.

If for any request a custom page object class is used to override some
:ref:`zapi-extract` fields, the following stat is set:
When a page object class is registered for a given combination of item type and
URL, and that page object class overrides some fields, the following stat is
set:

.. code-block:: python

@@ -40,6 +39,32 @@ If for any request a custom page object class is used to override some
.. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine
whether a field has been overridden or not.

Item URLs are read from the ``url`` field by default. Use
:setting:`ZYTE_API_AUTO_FIELD_URL_FIELDS` to configure a different field for
any given item type.

.. setting:: ZYTE_API_AUTO_FIELD_URL_FIELDS

ZYTE_API_AUTO_FIELD_URL_FIELDS
==============================

Default: ``{}``

Dictionary where keys are item types or their import paths, and values are
strings with the names of the fields in those item types that indicate the
source URL of the item.

For example:

.. code-block:: python
:caption: settings.py

ZYTE_API_AUTO_FIELD_URL_FIELDS = {
"my_project.items.CustomItem": "custom_url_field",
}

If a URL field is not specified for an item, ``url`` is used by default.
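To illustrate, here is a minimal, dependency-free sketch of an item that keeps its source URL in a custom field; the item class and field names mirror the example above and are assumptions for illustration, not code from this pull request:

```python
from dataclasses import dataclass


@dataclass
class CustomItem:
    # Source URL stored under a non-default field name.
    custom_url_field: str
    name: str


# settings.py: tell the stats pipeline which field holds the URL
# for this item type (keys may be item types or import paths).
ZYTE_API_AUTO_FIELD_URL_FIELDS = {
    "my_project.items.CustomItem": "custom_url_field",
}

item = CustomItem(custom_url_field="https://example.com/p/1", name="Example")
# Fall back to "url" when no mapping exists for an item type.
url_field = ZYTE_API_AUTO_FIELD_URL_FIELDS.get(
    "my_project.items.CustomItem", "url"
)
```

With this mapping, the pipeline reads `item.custom_url_field` instead of `item.url` when resolving which page object produced the item.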

.. setting:: ZYTE_API_AUTOMAP_PARAMS

ZYTE_API_AUTOMAP_PARAMS
13 changes: 13 additions & 0 deletions docs/setup.rst
@@ -120,6 +120,8 @@ spider to use Zyte API for all requests, set the following setting as well:

ZYTE_API_TRANSPARENT_MODE = True

.. _scrapy-poet-manual-setup:

For :ref:`scrapy-poet integration <scrapy-poet>`, add the following provider to
the ``SCRAPY_POET_PROVIDERS`` setting:

@@ -150,6 +152,17 @@ middleware to the :setting:`DOWNLOADER_MIDDLEWARES
"scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667,
}

For :setting:`ZYTE_API_AUTO_FIELD_STATS` support, first :ref:`enable
scrapy-poet integration <scrapy-poet-manual-setup>`, and then add the following
item pipeline to the :setting:`ITEM_PIPELINES <scrapy:ITEM_PIPELINES>` setting:

.. code-block:: python
:caption: settings.py

ITEM_PIPELINES = {
"scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline": 0,
}


.. _reactor-change:

88 changes: 88 additions & 0 deletions scrapy_zyte_api/_poet_item_pipelines.py
@@ -0,0 +1,88 @@
from logging import getLogger
from typing import Any, Set, Type

from itemadapter import ItemAdapter
from scrapy import Spider
from scrapy.crawler import Crawler
from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import load_object
from scrapy_poet import InjectionMiddleware
from web_poet.fields import get_fields_dict
from web_poet.utils import get_fq_class_name
from zyte_common_items.fields import is_auto_field

logger = getLogger(__name__)


class ScrapyZyteAPIAutoFieldStatsItemPipeline:
Review comment from a maintainer:

Moving it to scrapy-poet might allow to make the name shorter :)


@classmethod
def from_crawler(cls, crawler):
return cls(crawler)

def __init__(self, crawler: Crawler):
if not crawler.settings.getbool("ZYTE_API_AUTO_FIELD_STATS", False):
raise NotConfigured

raw_url_fields = crawler.settings.getdict("ZYTE_API_AUTO_FIELD_URL_FIELDS", {})
self._url_fields = {load_object(k): v for k, v in raw_url_fields.items()}
self._seen: Set[Type] = set()
self._crawler = crawler
self._stats = crawler.stats
self._item_cls_without_url: Set[Type] = set()

def open_spider(self, spider):
for component in self._crawler.engine.downloader.middleware.middlewares:
if isinstance(component, InjectionMiddleware):
self._registry = component.injector.registry
return
raise RuntimeError(
"Could not find scrapy_poet.InjectionMiddleware among downloader "
"middlewares. scrapy-poet may be misconfigured."
)

def process_item(self, item: Any, spider: Spider):
item_cls = item.__class__

url_field = self._url_fields.get(item_cls, "url")
adapter = ItemAdapter(item)
url = adapter.get(url_field, None)
if not url:
if item_cls not in self._item_cls_without_url:
self._item_cls_without_url.add(item_cls)
logger.warning(
f"An item of type {item_cls} was missing a non-empty URL "
f"in its {url_field!r} field. An item URL is necessary to "
f"determine the page object that was used to generate "
f"that item, and hence print the auto field stats that "
f"you requested by enabling the ZYTE_API_AUTO_FIELD_STATS "
f"setting. If {url_field!r} is the wrong URL field for "
f"that item type, use the ZYTE_API_AUTO_FIELD_URL_FIELDS "
f"setting to set a different field."
)
return

page_cls = self._registry.page_cls_for_item(url, item_cls)

cls = page_cls or item_cls
if cls in self._seen:
return
self._seen.add(cls)

if not page_cls:
field_list = "(all fields)"
else:
auto_fields = set()
missing_fields = False
for field_name in get_fields_dict(page_cls):
if is_auto_field(page_cls, field_name): # type: ignore[arg-type]
auto_fields.add(field_name)
else:
missing_fields = True
if missing_fields:
field_list = " ".join(sorted(auto_fields))
else:
field_list = "(all fields)"

cls_fqn = get_fq_class_name(cls)
self._stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list)
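The `(all fields)` versus space-separated field-list behavior above can be distilled into a dependency-free sketch; `summarize_auto_fields` and its `is_auto` callback are stand-ins for `get_fields_dict` and `is_auto_field`, assumed names rather than part of the actual code:

```python
from typing import Callable, Iterable


def summarize_auto_fields(
    fields: Iterable[str], is_auto: Callable[[str], bool]
) -> str:
    """Mirror the pipeline's stat value: "(all fields)" when every
    field comes from automatic extraction, otherwise a sorted,
    space-separated list of only the auto-extracted fields."""
    fields = set(fields)
    auto = {name for name in fields if is_auto(name)}
    if auto == fields:
        return "(all fields)"
    return " ".join(sorted(auto))
```

For example, for a page object with fields `name`, `price`, and `brand` where only `brand` is overridden, the stat value would come out as `"name price"`.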
4 changes: 4 additions & 0 deletions scrapy_zyte_api/addon.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,13 @@
except ImportError:
pass
else:
from scrapy_zyte_api.poet import ScrapyZyteAPIAutoFieldStatsItemPipeline

Codecov warning: added line scrapy_zyte_api/addon.py#L114 was not covered by tests.
from scrapy_zyte_api.providers import ZyteApiProvider

_setdefault(settings, "DOWNLOADER_MIDDLEWARES", InjectionMiddleware, 543)
_setdefault(

Codecov warning: added line scrapy_zyte_api/addon.py#L118 was not covered by tests.
settings, "ITEM_PIPELINES", ScrapyZyteAPIAutoFieldStatsItemPipeline, 0
)
_setdefault(settings, "SCRAPY_POET_PROVIDERS", ZyteApiProvider, 1100)

if settings.getbool("ZYTE_API_SESSION_ENABLED", False):
1 change: 1 addition & 0 deletions scrapy_zyte_api/poet.py
@@ -0,0 +1 @@
from ._poet_item_pipelines import ScrapyZyteAPIAutoFieldStatsItemPipeline
35 changes: 1 addition & 34 deletions scrapy_zyte_api/providers.py
@@ -1,4 +1,4 @@
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast
from typing import Any, Callable, Dict, List, Optional, Sequence, Set

from andi.typeutils import is_typing_annotated, strip_annotated
from scrapy import Request
@@ -13,8 +13,6 @@
HttpResponseHeaders,
)
from web_poet.annotated import AnnotatedInstance
from web_poet.fields import get_fields_dict
from web_poet.utils import get_fq_class_name
from zyte_common_items import (
Article,
ArticleList,
@@ -32,7 +30,6 @@
ProductList,
ProductNavigation,
)
from zyte_common_items.fields import is_auto_field

from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot
from scrapy_zyte_api._annotations import _ActionResult
@@ -84,38 +81,9 @@ class ZyteApiProvider(PageObjectInputProvider):
Screenshot,
}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._should_track_auto_fields = None
self._tracked_auto_fields = set()

def is_provided(self, type_: Callable) -> bool:
return super().is_provided(strip_annotated(type_))

def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
if cls not in _ITEM_KEYWORDS:
return
if self._should_track_auto_fields is None:
self._should_track_auto_fields = crawler.settings.getbool(
"ZYTE_API_AUTO_FIELD_STATS", False
)
if self._should_track_auto_fields is False:
return
cls = self.injector.registry.page_cls_for_item(request.url, cls) or cls
if cls in self._tracked_auto_fields:
return
self._tracked_auto_fields.add(cls)
if cls in _ITEM_KEYWORDS:
field_list = "(all fields)"
else:
auto_fields = set()
for field_name in get_fields_dict(cls):
if is_auto_field(cls, field_name): # type: ignore[arg-type]
auto_fields.add(field_name)
field_list = " ".join(sorted(auto_fields))
cls_fqn = get_fq_class_name(cls)
crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list)

async def __call__( # noqa: C901
self, to_provide: Set[Callable], request: Request, crawler: Crawler
) -> Sequence[Any]:
@@ -125,7 +93,6 @@ async def __call__( # noqa: C901
http_response = None
screenshot_requested = Screenshot in to_provide
for cls in list(to_provide):
self._track_auto_fields(crawler, request, cast(type, cls))
item = self.injector.weak_cache.get(request, {}).get(cls)
if item:
results.append(item)
2 changes: 2 additions & 0 deletions setup.cfg
@@ -4,5 +4,7 @@ max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4
per-file-ignores =
tests/test_auto_field_stats.py: E402
tests/test_providers.py: E402
scrapy_zyte_api/__init__.py: F401
scrapy_zyte_api/poet.py: F401
2 changes: 1 addition & 1 deletion setup.py
@@ -33,7 +33,7 @@ def get_version():
"andi>=0.6.0",
"scrapy-poet>=0.22.3",
"web-poet>=0.17.0",
"zyte-common-items>=0.20.0",
"zyte-common-items>=0.21.0",
]
},
classifiers=[
4 changes: 4 additions & 0 deletions tests/__init__.py
@@ -42,6 +42,9 @@
SETTINGS["SCRAPY_POET_PROVIDERS"] = {
"scrapy_zyte_api.providers.ZyteApiProvider": 1100
}
SETTINGS["ITEM_PIPELINES"] = {
"scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline": 0
}
SETTINGS_ADDON: SETTINGS_T = {
"ADDONS": {
Addon: 500,
@@ -108,6 +111,7 @@ def serialize_settings(settings):
del result[setting]
for setting in (
"DOWNLOADER_MIDDLEWARES",
"ITEM_PIPELINES",
"SCRAPY_POET_PROVIDERS",
"SPIDER_MIDDLEWARES",
):
7 changes: 7 additions & 0 deletions tests/test_addon.py
@@ -24,8 +24,10 @@
POET = False
InjectionMiddleware = None
ZyteApiProvider: Optional[Type] = None
ScrapyZyteAPIAutoFieldStatsItemPipeline: Optional[Type] = None
else:
POET = True
from scrapy_zyte_api.poet import ScrapyZyteAPIAutoFieldStatsItemPipeline
from scrapy_zyte_api.providers import ZyteApiProvider

_crawler = get_crawler()
@@ -120,6 +122,7 @@ def _test_setting_changes(initial_settings, expected_settings):
# Test separately settings that copy_to_dict messes up.
for setting in (
"DOWNLOADER_MIDDLEWARES",
"ITEM_PIPELINES",
"SCRAPY_POET_PROVIDERS",
"SPIDER_MIDDLEWARES",
):
@@ -145,6 +148,7 @@ def _test_setting_changes(initial_settings, expected_settings):
"http": "scrapy_zyte_api.handler.ScrapyZyteAPIHTTPDownloadHandler",
"https": "scrapy_zyte_api.handler.ScrapyZyteAPIHTTPSDownloadHandler",
},
"ITEM_PIPELINES": {},
"REQUEST_FINGERPRINTER_CLASS": "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter",
"SPIDER_MIDDLEWARES": {
ScrapyZyteAPISpiderMiddleware: 100,
@@ -230,6 +234,9 @@ def test_no_poet_setting_changes(initial_settings, expected_settings):
ScrapyZyteAPISessionDownloaderMiddleware: 667,
InjectionMiddleware: 543,
},
"ITEM_PIPELINES": {
ScrapyZyteAPIAutoFieldStatsItemPipeline: 0,
},
"SCRAPY_POET_PROVIDERS": {
ZyteApiProvider: 1100,
},
3 changes: 2 additions & 1 deletion tests/test_api_requests.py
@@ -650,6 +650,7 @@ async def test_default_params_immutability(setting_key, meta_key, setting, meta)
async def _test_automap(
settings, request_kwargs, meta, expected, warnings, caplog, cookie_jar=None
):
caplog.clear()
request = Request(url="https://example.com", **request_kwargs)
request.meta["zyte_api_automap"] = meta
settings = {**settings, "ZYTE_API_TRANSPARENT_MODE": True}
@@ -694,7 +695,7 @@ async def _test_automap(
for warning in warnings:
assert warning in caplog.text
else:
assert not caplog.records
assert not caplog.records, caplog.records[0].args


@pytest.mark.parametrize(
Expand Down
Loading