Skip to content

Commit

Permalink
HTML unescape py2/3 compat - fixes #4
Browse files Browse the repository at this point in the history
  • Loading branch information
mcs07 committed Oct 10, 2016
1 parent 246ec77 commit 8d30d20
Showing 1 changed file with 9 additions and 3 deletions.
12 changes: 9 additions & 3 deletions chemdataextractor/cli/dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from __future__ import division
from __future__ import print_function
import re
import HTMLParser
import sys

import click
Expand All @@ -23,7 +22,14 @@
from ..nlp.tag import DictionaryTagger
from ..nlp.cem import CsDictCemTagger, CiDictCemTagger, STOPLIST, STOP_SUB, STOP_TOKENS

pars = HTMLParser.HTMLParser()

# Resolve a single `unescape` callable that works on both Python 2 and 3.
# Python 3.4+ provides html.unescape directly; where that import fails we
# fall back to the bound HTMLParser.unescape method, located through six so
# the module name difference between py2 and py3 is handled for us.
try:
    from html import unescape
except ImportError:
    from six.moves.html_parser import HTMLParser
    unescape = HTMLParser().unescape


# NOTE: patterns are raw strings so that regex escapes such as \[ and \d reach
# the re module untouched instead of relying on Python passing unknown string
# escapes through (which emits a Deprecation/SyntaxWarning on modern Pythons).

# One or more bracketed/parenthesised nomenclature-group tags (e.g. "[USAN]",
# "(INN)", "[USAN:INN]") anchored at the end of the string; case-insensitive.
NG_RE = re.compile(r'([\[\(](\d\d?CI|USAN|r?INN|BAN|JAN|USP)(\d\d?CI|USAN|r?INN|BAN|JAN|USP|[:\-,]|spanish|latin)*[\)\]])+$', re.I | re.U)
# A leading qualifier word followed by a single space, anchored at the start.
START_RE = re.compile(r'^(anhydrous|elemental|amorphous|conjugated|colloidal|activated) ', re.I | re.U)
# A single bracketed/parenthesised descriptor word (e.g. "(salt)",
# "[crude product]") anchored at the end of the string; case-insensitive.
END_RE = re.compile(r'[\[\(]((crude )?product|substance|solution|anhydrous|derivative|analog|salt|modified|discontinued|injectable|anesthetic|pharmaceutical|natural|nonionic|european|ester|dye|tablets?|mineral|VAN|hydrolyzed)[\)\]]$', re.I | re.U)
Expand Down Expand Up @@ -122,7 +128,7 @@ def _process_name(name):
"""Fix issues with Jochem names."""

# Unescape HTML entities
name = pars.unescape(name)
name = unescape(name)

# Remove bracketed stuff on the end
name = NG_RE.sub('', name).strip() # Nomenclature groups
Expand Down

0 comments on commit 8d30d20

Please sign in to comment.