Merge pull request #152 from carpedm20/skin_tone_parser

Skin tone parser
carpedm20 · Jan 25, 2021 · 9365b6a · 9365b6a
2 parents ea334f0 + 1a21833
commit 9365b6a
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 56 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,20 +1,23 @@
 emoji
 =====
 
+1.1.1
+-----
+* Emoji extractor refactored 
 
 1.1.0
 -----
-* Added support of Italian naming of emojis
+* Added support for Italian naming of emojis
+* Added Python 3.8 and 3.9 as supported versions
 
 1.0.1
 -----
-* Added Python 3.8 and 3.9 as supported versions
-* Minot bug fixing
+* Bug fixing
 
 1.0.0
 -----
-* Added support of Spanish naming of emojis
-* Added support of Portuguese naming of emojis
+* Added support for Spanish naming of emojis
+* Added support for Portuguese naming of emojis
 * Emoji packs split by language to different modules
 
 0.3.5

diff --git a/LICENSE.txt b/LICENSE.txt
@@ -1,6 +1,6 @@
 New BSD License
 
-Copyright (c) 2014-2015, Taehoon Kim and Kevin Wurster
+Copyright (c) 2014-2021, Taehoon Kim and Kevin Wurster
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/emoji/__init__.py b/emoji/__init__.py
@@ -27,7 +27,7 @@
     'EMOJI_ALIAS_UNICODE_ENGLISH', 'UNICODE_EMOJI_ALIAS_ENGLISH',
 ]
 
-__version__ = '1.1.0'
+__version__ = '1.1.1'
 __author__ = 'Taehoon Kim and Kevin Wurster'
 __email__ = 'carpedm20@gmail.com'
 # and wursterk@gmail.com

diff --git a/utils/get-codes-from-unicode-consortium.py b/utils/get-codes-from-unicode-consortium.py
@@ -1,5 +1,4 @@
-#!/usr/bin/env python
-
+# -*- coding: utf-8 -*-
 
 """
 Extract the full list of emoji and names from the Unicode Consortium and
@@ -8,64 +7,54 @@
 
 Written and run with Python3.  Not tested on Python2 and definitely not
 intended for production use.
-
-http://www.unicode.org/Public/emoji/1.0/full-emoji-list.html
 """
 
-
-from collections import OrderedDict
-
 import requests
 from bs4 import BeautifulSoup
 
-url = 'http://www.unicode.org/emoji/charts/emoji-list.html'
-
-
-response = requests.get(url)
-response.raise_for_status()
-soup = BeautifulSoup(response.text, 'html.parser')
 
-# with open('utils/content.html') as f:
-#     soup = BeautifulSoup(f.read())
-
-header = [
-    'Count', 'Code', 'Sample', 'Name',
-]
+def extract_emojis(url) -> dict:
+    output = {}
+    response = requests.get(url)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, 'html.parser')
+    table = soup.find('table')
+    names = [n.text for n in table.find_all(attrs={"class": "name"})]
+    codes = [c.text for c in table.find_all(attrs={"class": "code"})]
+    for name, code in zip(names, codes):
+        """
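+        strip the leading 'flag: ' prefix, then remove colons, commas,
+        opening and closing smart quotes, and the circled asterisk (⊛)
+        symbol used to denote newly added emojis; finally trim surrounding
+        whitespace (left behind in the asterisk case) and replace the
+        remaining spaces with underscores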
+        """
+        name = name.removeprefix('flag: ')\
+                   .replace(':', '') \
+                   .replace(',', '') \
+                   .replace(u'\u201c', '') \
+                   .replace(u'\u201d', '') \
+                   .replace(u'\u229b', '') \
+                   .strip() \
+                   .replace(' ', '_')
 
-output = {}
-for row in soup.find('table').find_all('tr'):
-    cols = row.find_all('td')
-    cols = [e.text.strip() for e in cols]
-    d = OrderedDict(zip(header, [e.strip() for e in cols]))
-    if d:
         _code = []
-        for c in d['Code'].split(' '):
+        for c in code.split(' '):
             if len(c) == 6:
-                _code.append(c.replace('+', '0000'))
+                _code.append(c.replace('U+', '\\U0000'))
             else:
-                _code.append(c.replace('+', '000'))
-        code = ''.join(_code)
+                _code.append(c.replace('U+', '\\U000'))
+            code = ''.join(_code)
+        output[name] = code
+    return output
 
-        """
-            replace semi-colons, commas,
-            open smart quote, close smart quote,
-            and asterisk (⊛) symbol used to
-            denote newly added emojis
-
-            replace spaces after trimming for the
-            asterisk case
-        """
 
-        name = d['Name'].replace(':', '') \
-                        .replace(',', '') \
-                        .replace(u'\u201c', '') \
-                        .replace(u'\u201d', '') \
-                        .replace(u'\u229b', '')\
-                        .strip()\
-                        .replace(' ', '_')
+if __name__ == '__main__':
+    emoji_url = 'http://www.unicode.org/emoji/charts/full-emoji-list.html'
+    emoji_modifiers_url = 'http://www.unicode.org/emoji/charts/full-emoji-modifiers.html'
 
-        char = "u'" + code.replace('U', '\\U') + "',"
-        output[name] = char
+    emojis = extract_emojis(emoji_url)
+    emoji_modifiers = extract_emojis(emoji_modifiers_url)
+    total = emojis | emoji_modifiers
 
-for name in sorted(output.keys()):
-    print("    u':%s:': %s" % (name, output[name]))
+    for emoji_name, emoji_code in sorted(total.items()):
+        print(f"    u':{emoji_name}:': u'{emoji_code}'", end=',\n')
+    print('\nTotal count of emojis: ', len(total))  # can be checked here: http://www.unicode.org/emoji/charts/emoji-counts.html