Skip to content

Commit

Permalink
Fix links followed by hyphen (#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
tsutsu3 committed May 7, 2022
1 parent 119af39 commit dfefc3b
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 48 deletions.
90 changes: 42 additions & 48 deletions linkify_it/ucre.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,6 @@
"(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?"
)

SRC_HOST_TERMINATOR = (
"(?=$|"
+ TEXT_SEPARATORS
+ "|"
+ SRC_ZPCC
+ ")(?!-|_|:\\d|\\.-|\\.(?!$|"
+ SRC_ZPCC
+ "))"
)


# Allow anything in markdown spec, forbid quote (") at the first position
# because emails enclosed in quotes are far more common
SRC_EMAIL_NAME = '[\\-:&=\\+\\$,\\.a-zA-Z0-9_][\\-:&=\\+\\$,\\"\\.a-zA-Z0-9_]*'
Expand Down Expand Up @@ -99,15 +88,6 @@

TPL_HOST_NO_IP_FUZZY = "(?:(?:(?:" + SRC_DOMAIN + ")\\.)+(?:%TLDS%))"

SRC_HOST_STRICT = SRC_HOST + SRC_HOST_TERMINATOR

TPL_HOST_FUZZY_STRICT = TPL_HOST_FUZZY + SRC_HOST_TERMINATOR

SRC_HOST_PORT_STRICT = SRC_HOST + SRC_PORT + SRC_HOST_TERMINATOR

TPL_HOST_PORT_FUZZY_STRICT = TPL_HOST_FUZZY + SRC_PORT + SRC_HOST_TERMINATOR

TPL_HOST_PORT_NO_IP_FUZZY_STRICT = TPL_HOST_NO_IP_FUZZY + SRC_PORT + SRC_HOST_TERMINATOR

# =============================================================================

Expand All @@ -116,35 +96,24 @@
"localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:" + SRC_ZPCC + "|>|$))"
)

TPL_EMAIL_FUZZY = (
"(^|"
+ TEXT_SEPARATORS
+ '|"|\\(|'
+ SRC_ZCC
+ ")"
+ "("
+ SRC_EMAIL_NAME
+ "@"
+ TPL_HOST_FUZZY_STRICT
+ ")"
)

def _re_host_terminator(opts):
    """Build the zero-width regex fragment that must follow a host name.

    The fragment is made of two lookaheads and consumes no characters:

    * a positive lookahead requiring end of input, ``TEXT_SEPARATORS`` or
      ``SRC_ZPCC`` at the current position;
    * a negative lookahead rejecting a hyphen, an underscore, a colon
      followed by a digit, a dot followed by a hyphen, or a dot that is not
      itself followed by end of input or ``SRC_ZPCC``.

    Args:
        opts (dict | None): linkifier options. When the ``"---"`` entry is
            truthy, a hyphen that starts a run of three hyphens no longer
            blocks termination, so ``host---text`` ends the host at ``host``.
            ``None`` or an empty mapping selects the default behaviour.

    Return:
        str: regex source string for the host terminator
    """
    # Tolerate ``opts`` being None instead of failing on ``None.get``; the
    # option is simply treated as disabled in that case.
    long_dash = bool(opts) and bool(opts.get("---"))

    # Only the hyphen alternative depends on the option; everything else in
    # the negative lookahead is fixed.
    hyphen_guard = "-(?!--)|" if long_dash else "-|"

    return (
        "(?=$|"
        + TEXT_SEPARATORS
        + "|"
        + SRC_ZPCC
        + ")"
        + "(?!"
        + hyphen_guard
        + "_|:\\d|\\.-|\\.(?!$|"
        + SRC_ZPCC
        + "))"
    )

def _re_src_path(opts):
try:
_ = opts["---"]
# KeyError: Not found key:"---"
# TypeError: opts is None
except (KeyError, TypeError):
long_dash_flag = False
else:
long_dash_flag = True

if long_dash_flag:
options = "\\-(?!--(?:[^-]|$))(?:-*)|" # `---` => long dash, terminate
else:
options = "\\-+|"

def _re_src_path(opts):
src_path = (
"(?:"
+ "[/?#]"
Expand Down Expand Up @@ -184,7 +153,7 @@ def _re_src_path(opts):
+ "\\.(?!"
+ SRC_ZCC
+ "|[.]|$)|"
+ options
+ ("\\-(?!--(?:[^-]|$))(?:-*)|" if opts.get("---") else "\\-+|")
+ ",(?!"
+ SRC_ZCC
+ "|$)|" # allow `,,,` in paths
Expand Down Expand Up @@ -214,6 +183,31 @@ def build_re(opts):
Return:
dict: dict of regex string
"""
SRC_HOST_STRICT = SRC_HOST + _re_host_terminator(opts)

TPL_HOST_FUZZY_STRICT = TPL_HOST_FUZZY + _re_host_terminator(opts)

SRC_HOST_PORT_STRICT = SRC_HOST + SRC_PORT + _re_host_terminator(opts)

TPL_HOST_PORT_FUZZY_STRICT = TPL_HOST_FUZZY + SRC_PORT + _re_host_terminator(opts)

TPL_HOST_PORT_NO_IP_FUZZY_STRICT = (
TPL_HOST_NO_IP_FUZZY + SRC_PORT + _re_host_terminator(opts)
)

TPL_EMAIL_FUZZY = (
"(^|"
+ TEXT_SEPARATORS
+ '|"|\\(|'
+ SRC_ZCC
+ ")"
+ "("
+ SRC_EMAIL_NAME
+ "@"
+ TPL_HOST_FUZZY_STRICT
+ ")"
)

regex = {
"src_Any": SRC_ANY,
"src_Cc": SRC_CC,
Expand All @@ -226,7 +220,7 @@ def build_re(opts):
"src_ip4": SRC_IP4,
"src_auth": SRC_AUTH,
"src_port": SRC_PORT,
"src_host_terminator": SRC_HOST_TERMINATOR,
"src_host_terminator": _re_host_terminator(opts),
"src_path": _re_src_path(opts),
"src_email_name": SRC_EMAIL_NAME,
"src_xn": SRC_XN,
Expand Down
2 changes: 2 additions & 0 deletions test/test_apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,10 +264,12 @@ def test_api_shoud_accept_triple_minus():
linkifyit = LinkifyIt()

assert linkifyit.match("http://e.com/foo---bar")[0].text == "http://e.com/foo---bar"
assert linkifyit.match("text@example.com---foo") is None

linkifyit = LinkifyIt(None, {"---": True})

assert linkifyit.match("http://e.com/foo---bar")[0].text == "http://e.com/foo"
assert linkifyit.match("text@example.com---foo")[0].text == "text@example.com"


# issue #25. Schema key containing - not producing matches
Expand Down

0 comments on commit dfefc3b

Please sign in to comment.