Use lru_cache on Wtp.get_page()

`get_page()` is kind of slow (1s per call); caching the requests improves performance significantly. This reduces the processing time of the Chinese Wiktionary from 40 minutes to 10 minutes.
tatuylonen · Aug 21, 2023 · 4b3d963 · 4b3d963
1 parent 4edfb17
commit 4b3d963
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 2 deletions.
diff --git a/wikitextprocessor/core.py b/wikitextprocessor/core.py
@@ -15,6 +15,7 @@
 import urllib.parse
 from collections.abc import Sequence
 from dataclasses import dataclass
+from functools import lru_cache
 from pathlib import Path
 from types import TracebackType
 from typing import (
@@ -1683,6 +1684,7 @@ def magic_repl(m: re.Match) -> str:
         # print("    _finalize_expand:{!r}".format(text))
         return text
 
+    @lru_cache
     def get_page(
         self, title: str, namespace_id: Optional[int] = None
     ) -> Optional[Page]:
@@ -1710,7 +1712,10 @@ def get_page(
                 # Add namespace prefix
                 title = ns_prefix + title
 
-        query_str = "SELECT * FROM pages WHERE title = ?"
+        query_str = """
+        SELECT title, namespace_id, redirect_to, need_pre_expand, body, model
+        FROM pages WHERE title = ?
+        """
         if namespace_id is not None:
             query_str += " AND namespace_id = ?"
         query_str += " LIMIT 1"

diff --git a/wikitextprocessor/luaexec.py b/wikitextprocessor/luaexec.py
@@ -219,7 +219,7 @@ def get_page_info(ctx: "Wtp", title: str, namespace_id: int) -> "_LuaTable":
     assert ctx.lua is not None
 
     page_id = 0  # XXX collect required info in phase 1
-    page: Optional["Page"] = ctx.get_page(title, namespace_id)
+    page = ctx.get_page(title, namespace_id)
     # whether the page exists and what its id might be
     dt = {
         "id": page_id,