Skip to content

Commit

Permalink
move typestubs -> wikitextprocessor/
Browse files Browse the repository at this point in the history
  • Loading branch information
kristian-clausal committed Jun 29, 2023
1 parent dbdbe61 commit 811e656
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 56 deletions.
130 changes: 74 additions & 56 deletions wikitextprocessor/dumpparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

from collections.abc import Callable
from pathlib import Path
from typing import Optional, Set, List, IO, TYPE_CHECKING, Protocol
from typing import (Optional, Set, List, IO, TYPE_CHECKING, Protocol, Dict,
Union,)
import unicodedata

if TYPE_CHECKING:
Expand Down Expand Up @@ -45,62 +46,75 @@ def process_input(
# process to maximize concurrency). This requires the ``buffer`` program.
from lxml import etree

if path.endswith(".bz2"):
bzcat_command = (
"lbzcat" if shutil.which("lbzcat") is not None else "bzcat"
)
subp = subprocess.Popen([bzcat_command, path], stdout=subprocess.PIPE)
wikt_f = subp.stdout
else:
wikt_f = open(path, "rb")

if not wikt_f:
logging.error("File or stdout is None??")
return

namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
namespaces = {None: namespace_str}

page_nums = 0
for _, page_element in etree.iterparse(
wikt_f, tag=f"{{{namespace_str}}}page"
):
title = page_element.findtext("title", "", namespaces)
namespace_id = int(page_element.findtext("ns", "0", namespaces))
if (
namespace_id not in namespace_ids
or title.endswith("/documentation")
or "/testcases" in title
):
page_element.clear(keep_tail=True)
continue

text = None
redirect_to = None
model = page_element.findtext("revision/model", "", namespaces)
if (
redirect_element := page_element.find(
"redirect", namespaces=namespaces
def pick_stream() -> Union[IO[bytes], None]:
if path.endswith(".bz2"):
bzcat_command: str = (
"lbzcat" if shutil.which("lbzcat") is not None else "bzcat"
)
) is not None:
redirect_to = redirect_element.get("title", "")
subp: subprocess.Popen[bytes] = \
subprocess.Popen([bzcat_command, path], stdout=subprocess.PIPE)
return subp.stdout
else:
if model not in {"wikitext", "Scribunto", "json"}:
# ignore css, javascript and sanitized-css pages
page_element.clear(keep_tail=True)
continue
text = page_element.findtext("revision/text", "", namespaces)
return open(path, "rb")

page_cb(
title, namespace_id, body=text, redirect_to=redirect_to, model=model
)
page_element.clear(keep_tail=True)
page_nums += 1
if page_nums % 10000 == 0:
logging.info(f" ... {page_nums} raw pages collected")
with pick_stream() as wikt_f:
if not wikt_f:
logging.error("File or stdout is None??")
return

wikt_f.close()
namespace_str: str = "http://www.mediawiki.org/xml/export-0.10/"
namespaces: Dict[None, str] = {None: namespace_str}

page_nums: int = 0
page_element: etree._Element # preannotate to make type-checker happy
for _, page_element in etree.iterparse(
wikt_f, tag=f"{{{namespace_str}}}page"
):
title: str = page_element.findtext("title", "", namespaces)
namespace_id: int = int(page_element.findtext("ns",
"0",
namespaces)
)
if (namespace_id not in namespace_ids
or title.endswith("/documentation")
or "/testcases" in title
):
page_element.clear(keep_tail=True)
continue

text: Optional[str] = None
redirect_to: Optional[str] = None
model: Optional[str] = page_element.findtext("revision/model",
"",
namespaces)
redirect_element: Optional[etree._Element] # can't annotate walrus
if (
redirect_element := page_element.find(
"redirect", namespaces=namespaces
)
) is not None:
redirect_to = redirect_element.get("title", "")
# redirect_to existing implies a redirection, but having a
# .get default to "" is a bit weird: redirect to empty string?
# But you can't use None either..?
else:
if model not in {"wikitext", "Scribunto", "json"}:
# ignore css, javascript and sanitized-css pages
page_element.clear(keep_tail=True)
continue
text = page_element.findtext("revision/text", "", namespaces)

page_cb(
title,
namespace_id,
body=text,
redirect_to=redirect_to,
model=model
)
page_element.clear(keep_tail=True)
page_nums += 1
if page_nums % 10000 == 0:
logging.info(f" ... {page_nums} raw pages collected")

def process_dump(
ctx: "Wtp",
Expand Down Expand Up @@ -230,17 +244,21 @@ def replace_invalid_windows_characters(s: str)-> str:
return s

def save_pages_to_file(ctx: "Wtp", directory: Path) -> None:
on_windows: bool = path_is_on_windows_partition(path)
name_max_length: int = os.pathconf("/", "PC_NAME_MAX")
page: Page
for page in ctx.get_all_pages():
title = replace_invalid_substrings(page.title)
if(path_is_on_windows_partition(directory)):
title: str = replace_invalid_substrings(page.title)
if on_windows:
title = replace_invalid_windows_characters(title)

if page.namespace_id == 0:
file_path = directory.joinpath(f"Words/{title[0:2]}/{title}.txt")
file_path: Path = directory.joinpath(
f"Words/{title[0:2]}/{title}.txt")
else:
file_path = directory.joinpath(f'{title.replace(":", "/", 1)}.txt')

if len(file_path.name.encode()) > os.pathconf("/", "PC_NAME_MAX"):
if len(file_path.name.encode()) > name_max_length:
file_path = file_path.with_stem(
file_path.stem[:50]
+ "_"
Expand Down
File renamed without changes.

5 comments on commit 811e656

@xxyzz
Copy link
Collaborator

@xxyzz xxyzz commented on 811e656 Jun 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could use the src-layout to let setuptools discover the package code automatically. The wiktextract project's pyproject.toml file uses the "custom discovery" feature, but every time a new directory is created, the pyproject.toml file needs to be updated.

@kristian-clausal
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error messages that setuptools gives for the layout stuff are so bad. I've had it happen twice, and I'd already forgotten what the cryptic message about flat layout meant (which was, don't have extra folders in the same directory as your package source folders...), so it took ages to figure out again.

Moving things over into a src directory is probably going to bite us in the ass one way or another (I just know it), so let's keep things as they are and add appropriate package names into pyproject.toml. I'll see if I can figure it out.

@kristian-clausal
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pip install ... -e . worked out, make test passed (as you'd expect), committing.

@xxyzz
Copy link
Collaborator

@xxyzz xxyzz commented on 811e656 Jun 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you using pylint? Because this commit adds some unnecessary types to local variables, like `namespace_id: int = int(page_element.findtext(...))`. mypy shouldn't throw an error if `: int` is not added, because it can tell the variable type from the function's return type. Maybe you're using pylint? pylint's default rules are too strict, and each lint feature needs to be enabled manually. I'd recommend Ruff; its default settings are more reasonable.

@kristian-clausal
Copy link
Collaborator Author

@kristian-clausal kristian-clausal commented on 811e656 Jun 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm adding the type-hints manually. That one's a bit silly, I admit. EDIT: I had to check to see whether int() throws an error or returns None, so it's not always obvious whether it's Optional[int] or int, and yes, it's a bit silly, but also explicit.

The commit failed because I had uncommitted changes I didn't realize fixed an issue with tests (which is why my make test passed while github's failed).

Please sign in to comment.