Skip to content

Commit

Permalink
Merge pull request #152 from carpedm20/skin_tone_parser
Browse files Browse the repository at this point in the history
Skin tone parser
  • Loading branch information
TahirJalilov committed Jan 25, 2021
2 parents ea334f0 + 1a21833 commit 9365b6a
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 56 deletions.
13 changes: 8 additions & 5 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
emoji
=====

1.1.1
-----
* Emoji extractor refactored

1.1.0
-----
* Added support of Italian naming of emojis
* Added support for Italian naming of emojis
* Added Python 3.8 and 3.9 as supported versions

1.0.1
-----
* Added Python 3.8 and 3.9 as supported versions
* Minot bug fixing
* Bug fixing

1.0.0
-----
* Added support of Spanish naming of emojis
* Added support of Portuguese naming of emojis
* Added support for Spanish naming of emojis
* Added support for Portuguese naming of emojis
* Emoji packs split by language to different modules

0.3.5
Expand Down
2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
New BSD License

Copyright (c) 2014-2015, Taehoon Kim and Kevin Wurster
Copyright (c) 2014-2021, Taehoon Kim and Kevin Wurster
All rights reserved.

Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion emoji/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
'EMOJI_ALIAS_UNICODE_ENGLISH', 'UNICODE_EMOJI_ALIAS_ENGLISH',
]

__version__ = '1.1.0'
__version__ = '1.1.1'
__author__ = 'Taehoon Kim and Kevin Wurster'
__email__ = 'carpedm20@gmail.com'
# and wursterk@gmail.com
Expand Down
87 changes: 38 additions & 49 deletions utils/get-codes-from-unicode-consortium.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python

# -*- coding: utf-8 -*-

"""
Extract the full list of emoji and names from the Unicode Consortium and
Expand All @@ -8,64 +7,54 @@
Written and run with Python3. Not tested on Python2 and definitely not
intended for production use.
http://www.unicode.org/Public/emoji/1.0/full-emoji-list.html
"""


from collections import OrderedDict

import requests
from bs4 import BeautifulSoup

url = 'http://www.unicode.org/emoji/charts/emoji-list.html'


response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# with open('utils/content.html') as f:
# soup = BeautifulSoup(f.read())

header = [
'Count', 'Code', 'Sample', 'Name',
]
def extract_emojis(url) -> dict:
output = {}
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
names = [n.text for n in table.find_all(attrs={"class": "name"})]
codes = [c.text for c in table.find_all(attrs={"class": "code"})]
for name, code in zip(names, codes):
"""
remove colons, commas, open smart quote, close smart quote,
and the asterisk (⊛) symbol used to denote newly added emojis;
trim first (for the asterisk case), then replace spaces with underscores
"""
name = name.removeprefix('flag: ')\
.replace(':', '') \
.replace(',', '') \
.replace(u'\u201c', '') \
.replace(u'\u201d', '') \
.replace(u'\u229b', '') \
.strip() \
.replace(' ', '_')

output = {}
for row in soup.find('table').find_all('tr'):
cols = row.find_all('td')
cols = [e.text.strip() for e in cols]
d = OrderedDict(zip(header, [e.strip() for e in cols]))
if d:
_code = []
for c in d['Code'].split(' '):
for c in code.split(' '):
if len(c) == 6:
_code.append(c.replace('+', '0000'))
_code.append(c.replace('U+', '\\U0000'))
else:
_code.append(c.replace('+', '000'))
code = ''.join(_code)
_code.append(c.replace('U+', '\\U000'))
code = ''.join(_code)
output[name] = code
return output

"""
replace semi-colons, commas,
open smart quote, close smart quote,
and asterisk (⊛) symbol used to
denote newly added emojis
replace spaces after trimming for the
asterisk case
"""

name = d['Name'].replace(':', '') \
.replace(',', '') \
.replace(u'\u201c', '') \
.replace(u'\u201d', '') \
.replace(u'\u229b', '')\
.strip()\
.replace(' ', '_')
if __name__ == '__main__':
emoji_url = 'http://www.unicode.org/emoji/charts/full-emoji-list.html'
emoji_modifiers_url = 'http://www.unicode.org/emoji/charts/full-emoji-modifiers.html'

char = "u'" + code.replace('U', '\\U') + "',"
output[name] = char
emojis = extract_emojis(emoji_url)
emoji_modifiers = extract_emojis(emoji_modifiers_url)
total = emojis | emoji_modifiers

for name in sorted(output.keys()):
print(" u':%s:': %s" % (name, output[name]))
for emoji_name, emoji_code in sorted(total.items()):
print(f" u':{emoji_name}:': u'{emoji_code}'", end=',\n')
print('\nTotal count of emojis: ', len(total)) # can be checked here: http://www.unicode.org/emoji/charts/emoji-counts.html

0 comments on commit 9365b6a

Please sign in to comment.