Skip to content

Commit

Permalink
Merge pull request #288 from xxyzz/new_dump_xml_ns
Browse files: browse the repository at this point in the history
Update XML dump file namespace version
  • Loading branch information
kristian-clausal committed Jun 7, 2024
2 parents bad4391 + ebc02f0 commit 6811128
Showing 1 changed file with 7 additions and 11 deletions.
18 changes: 7 additions & 11 deletions src/wikitextprocessor/dumpparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,13 @@ def parse_dump_xml(wtp: "Wtp", dump_path: str, namespace_ids: set[int]) -> None:
from lxml import etree

with decompress_dump_file(dump_path) as p:
namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
namespaces = {None: namespace_str}
page_nums = 0
for _, page_element in etree.iterparse(
p.stdout if isinstance(p, subprocess.Popen) else p, # type: ignore
tag=f"{{{namespace_str}}}page",
tag="{*}page",
):
title = page_element.findtext("title", "", namespaces)
namespace_id = int(page_element.findtext("ns", "0", namespaces))
title = page_element.findtext("{*}title", "")
namespace_id = int(page_element.findtext("{*}ns", "0"))
if (
namespace_id not in namespace_ids
or title.endswith("/documentation")
Expand All @@ -64,13 +62,11 @@ def parse_dump_xml(wtp: "Wtp", dump_path: str, namespace_ids: set[int]) -> None:

text: Optional[str] = None
redirect_to: Optional[str] = None
model = page_element.findtext("revision/model", "", namespaces)
model = page_element.findtext("{*}revision/{*}model", "")
if (
redirect_element := page_element.find(
"redirect", namespaces=namespaces
)
redirect_element := page_element.find("{*}redirect")
) is not None:
redirect_to = redirect_element.get("title", "")
redirect_to = redirect_element.get("{*}title", "")
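# A non-None redirect_to marks this page as a redirect. Falling back
# to "" when the title attribute is missing is awkward (a redirect to
# the empty string?), but None presumably can't be the fallback either,
# since None is the initial "no redirect" value.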
Expand All @@ -79,7 +75,7 @@ def parse_dump_xml(wtp: "Wtp", dump_path: str, namespace_ids: set[int]) -> None:
# ignore css, javascript and sanitized-css pages
page_element.clear(keep_tail=True)
continue
text = page_element.findtext("revision/text", "", namespaces)
text = page_element.findtext("{*}revision/{*}text", "")

wtp.add_page(
title,
Expand Down

0 comments on commit 6811128

Please sign in to comment.