mirror of https://github.com/alantang1977/pg
Update emoji_manager.py
This commit is contained in:
parent
8f002a4eed
commit
750ac3b374
|
@ -4,106 +4,104 @@ import json
|
||||||
import csv
|
import csv
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from collections import deque
|
|
||||||
from typing import Set, List, Union
|
|
||||||
|
|
||||||
# Android-safe emoji sets by categories
|
# Android-safe emoji sets by categories
|
||||||
EMOJI_ANIMALS_NATURE = [
|
EMOJI_POOL = [
|
||||||
"🐶", "🐱", "🐭", "🐹", "🐰", "🦊", "🐻", "🐼", "🐨", "🐯",
|
"🐶", "🐱", "🐭", "🐹", "🐰", "🦊", "🐻", "🐼", "🐨", "🐯",
|
||||||
"🦁", "🐮", "🐷", "🐸", "🐵", "🦄", "🐔", "🐧", "🐦", "🐤"
|
"🦁", "🐮", "🐷", "🐸", "🐵", "🦄", "🐔", "🐧", "🐦", "🐤",
|
||||||
]
|
|
||||||
EMOJI_FOOD_DRINK = [
|
|
||||||
"🍏", "🍎", "🍐", "🍊", "🍋", "🍌", "🍉", "🍇", "🍓", "🫐",
|
"🍏", "🍎", "🍐", "🍊", "🍋", "🍌", "🍉", "🍇", "🍓", "🫐",
|
||||||
"🍈", "🍒", "🍑", "🥭", "🍍", "🥥", "🥝", "🍅", "🍆", "🥑"
|
"🍈", "🍒", "🍑", "🥭", "🍍", "🥥", "🥝", "🍅", "🍆", "🥑",
|
||||||
]
|
|
||||||
EMOJI_ACTIVITY = [
|
|
||||||
"⚽", "🏀", "🏈", "⚾", "🎾", "🏐", "🏉", "🎱", "🏓", "🏸",
|
"⚽", "🏀", "🏈", "⚾", "🎾", "🏐", "🏉", "🎱", "🏓", "🏸",
|
||||||
"🥅", "🏒", "🏑", "🏏", "⛳", "🏹", "🎣", "🤿", "🥊", "🥋"
|
"🥅", "🏒", "🏑", "🏏", "⛳", "🏹", "🎣", "🤿", "🥊", "🥋",
|
||||||
]
|
|
||||||
EMOJI_OBJECTS = [
|
|
||||||
"⌚", "📱", "💻", "🖨️", "🕹️", "🎮", "📷", "📸", "📹", "🎥",
|
"⌚", "📱", "💻", "🖨️", "🕹️", "🎮", "📷", "📸", "📹", "🎥",
|
||||||
"📺", "📻", "🎙️", "🎚️", "🎛️", "☎️", "📞", "📟", "📠", "🔋"
|
"📺", "📻", "🎙️", "🎚️", "🎛️", "☎️", "📞", "📟", "📠", "🔋",
|
||||||
]
|
|
||||||
EMOJI_PLACES = [
|
|
||||||
"🚗", "🚕", "🚙", "🚌", "🚎", "🏎️", "🚓", "🚑", "🚒", "🚐",
|
"🚗", "🚕", "🚙", "🚌", "🚎", "🏎️", "🚓", "🚑", "🚒", "🚐",
|
||||||
"🛻", "🚚", "🚛", "🚜", "🏍️", "🛵", "🚲", "🛴", "🚨", "🚔"
|
"🛻", "🚚", "🚛", "🚜", "🏍️", "🛵", "🚲", "🛴", "🚨", "🚔"
|
||||||
]
|
]
|
||||||
|
|
||||||
ANDROID_EMOJI_POOL = (
|
# Regex pattern for emoji (cover most cases, including ZWJ sequences)
|
||||||
EMOJI_ANIMALS_NATURE +
|
|
||||||
EMOJI_FOOD_DRINK +
|
|
||||||
EMOJI_ACTIVITY +
|
|
||||||
EMOJI_OBJECTS +
|
|
||||||
EMOJI_PLACES
|
|
||||||
)
|
|
||||||
|
|
||||||
# Regex pattern for emoji (wide, but not perfect!)
|
|
||||||
EMOJI_REGEX = re.compile(
|
EMOJI_REGEX = re.compile(
|
||||||
"["
|
r"("
|
||||||
"\U0001F300-\U0001F5FF"
|
r"(?:[\U0001F3FB-\U0001F3FF])|" # skin tone
|
||||||
"\U0001F600-\U0001F64F"
|
r"(?:[\U0001F9B0-\U0001F9B3])|" # hair
|
||||||
"\U0001F680-\U0001F6FF"
|
r"(?:[\U0001F1E6-\U0001F1FF]{2})|" # flags
|
||||||
"\U0001F700-\U0001F77F"
|
r"(?:[\U0001F600-\U0001F64F])|"
|
||||||
"\U0001F780-\U0001F7FF"
|
r"(?:[\U0001F300-\U0001F5FF])|"
|
||||||
"\U0001F800-\U0001F8FF"
|
r"(?:[\U0001F680-\U0001F6FF])|"
|
||||||
"\U0001F900-\U0001F9FF"
|
r"(?:[\U0001F700-\U0001F77F])|"
|
||||||
"\U0001FA00-\U0001FA6F"
|
r"(?:[\U0001F780-\U0001F7FF])|"
|
||||||
"\U0001FA70-\U0001FAFF"
|
r"(?:[\U0001F800-\U0001F8FF])|"
|
||||||
"\U00002702-\U000027B0"
|
r"(?:[\U0001F900-\U0001F9FF])|"
|
||||||
"\U000024C2-\U0001F251"
|
r"(?:[\U0001FA00-\U0001FA6F])|"
|
||||||
"]+", flags=re.UNICODE
|
r"(?:[\U0001FA70-\U0001FAFF])|"
|
||||||
|
r"(?:[\u2600-\u26FF])|"
|
||||||
|
r"(?:[\u2700-\u27BF])|"
|
||||||
|
r"(?:[\u2300-\u23FF])|"
|
||||||
|
r"(?:[\u2B05-\u2B07])|"
|
||||||
|
r"(?:\u200D)" # zwj joiner
|
||||||
|
r")+",
|
||||||
|
re.UNICODE
|
||||||
)
|
)
|
||||||
|
|
||||||
SUPPORTED_EXTS = (".json", ".txt", ".md", ".csv", ".xml", ".html", ".htm")
|
SUPPORTED_EXTS = (".json", ".txt", ".md", ".csv", ".xml", ".html", ".htm")
|
||||||
|
|
||||||
def extract_emojis(text: str) -> List[str]:
|
def extract_emojis(text):
|
||||||
return EMOJI_REGEX.findall(text)
|
# Find all emoji/sequence, keeping order, including duplicates
|
||||||
|
return [m.group(0) for m in EMOJI_REGEX.finditer(text)]
|
||||||
|
|
||||||
def build_emoji_map(original_emojis: List[str], emoji_pool: List[str]) -> dict:
|
def find_duplicates(emoji_list):
|
||||||
"""
|
# Return a set of emojis that occur more than once
|
||||||
Map each original emoji (in order of appearance) to a replacement, cyclic by pool.
|
from collections import Counter
|
||||||
The mapping is 1-to-1 and only for those emojis that existed in the original file.
|
count = Counter(emoji_list)
|
||||||
"""
|
return {em for em, c in count.items() if c > 1}
|
||||||
emoji_map = dict()
|
|
||||||
pool = deque(emoji_pool)
|
def build_emoji_replace_map(emoji_list, emoji_pool):
|
||||||
for emoji in original_emojis:
|
# Only map duplicate emojis; unique ones stay unchanged
|
||||||
if emoji not in emoji_map:
|
from collections import Counter, deque
|
||||||
new_emoji = pool.popleft()
|
|
||||||
emoji_map[emoji] = new_emoji
|
counter = Counter(emoji_list)
|
||||||
pool.append(new_emoji)
|
dups = [em for em in emoji_list if counter[em] > 1]
|
||||||
|
seen = set()
|
||||||
|
dups_unique = []
|
||||||
|
for em in dups:
|
||||||
|
if em not in seen:
|
||||||
|
dups_unique.append(em)
|
||||||
|
seen.add(em)
|
||||||
|
pool = deque([e for e in emoji_pool if e not in emoji_list]) # don't use emojis already present
|
||||||
|
emoji_map = {}
|
||||||
|
for em in dups_unique:
|
||||||
|
if not pool:
|
||||||
|
pool = deque([e for e in emoji_pool if e not in emoji_list])
|
||||||
|
if pool:
|
||||||
|
emoji_map[em] = pool.popleft()
|
||||||
return emoji_map
|
return emoji_map
|
||||||
|
|
||||||
def replace_emojis(text: str, emoji_map: dict) -> str:
|
def replace_duplicate_emojis(text, emoji_map):
|
||||||
# Only replace emojis that exist in the original, keep positions unchanged.
|
# Only replace duplicates (the 2nd, 3rd, ...) occurrence for each
|
||||||
def replace_fn(match):
|
# First occurrence stays, others replaced
|
||||||
emoji = match.group(0)
|
matches = list(EMOJI_REGEX.finditer(text))
|
||||||
return emoji_map.get(emoji, emoji)
|
new_text = []
|
||||||
return EMOJI_REGEX.sub(replace_fn, text)
|
last_idx = 0
|
||||||
|
seen = {}
|
||||||
|
for m in matches:
|
||||||
|
em = m.group(0)
|
||||||
|
new_text.append(text[last_idx:m.start()])
|
||||||
|
seen.setdefault(em, 0)
|
||||||
|
seen[em] += 1
|
||||||
|
if em in emoji_map and seen[em] > 1:
|
||||||
|
new_text.append(emoji_map[em])
|
||||||
|
else:
|
||||||
|
new_text.append(em)
|
||||||
|
last_idx = m.end()
|
||||||
|
new_text.append(text[last_idx:])
|
||||||
|
return ''.join(new_text)
|
||||||
|
|
||||||
def get_emojis_in_order(text: str) -> List[str]:
|
def process_json_file(src, dst, emoji_map):
|
||||||
# Keep duplicates and order of appearance, only unique in mapping
|
|
||||||
seen = set()
|
|
||||||
ordered = []
|
|
||||||
for em in EMOJI_REGEX.findall(text):
|
|
||||||
if em not in seen:
|
|
||||||
ordered.append(em)
|
|
||||||
seen.add(em)
|
|
||||||
return ordered
|
|
||||||
|
|
||||||
def load_file(filepath: str) -> str:
|
|
||||||
with open(filepath, "r", encoding="utf-8") as f:
|
|
||||||
return f.read()
|
|
||||||
|
|
||||||
def save_file(filepath: str, content: str):
|
|
||||||
with open(filepath, "w", encoding="utf-8") as f:
|
|
||||||
f.write(content)
|
|
||||||
|
|
||||||
def process_json_file(src: str, dst: str, emoji_map: dict):
|
|
||||||
with open(src, "r", encoding="utf-8") as f:
|
with open(src, "r", encoding="utf-8") as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
def recursive_replace(obj):
|
def recursive_replace(obj):
|
||||||
if isinstance(obj, str):
|
if isinstance(obj, str):
|
||||||
return replace_emojis(obj, emoji_map)
|
return replace_duplicate_emojis(obj, emoji_map)
|
||||||
elif isinstance(obj, list):
|
elif isinstance(obj, list):
|
||||||
return [recursive_replace(item) for item in obj]
|
return [recursive_replace(item) for item in obj]
|
||||||
elif isinstance(obj, dict):
|
elif isinstance(obj, dict):
|
||||||
|
@ -113,33 +111,35 @@ def process_json_file(src: str, dst: str, emoji_map: dict):
|
||||||
with open(dst, "w", encoding="utf-8") as f:
|
with open(dst, "w", encoding="utf-8") as f:
|
||||||
json.dump(new_data, f, ensure_ascii=False, indent=2)
|
json.dump(new_data, f, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
def process_csv_file(src: str, dst: str, emoji_map: dict):
|
def process_csv_file(src, dst, emoji_map):
|
||||||
with open(src, "r", encoding="utf-8", newline='') as f:
|
with open(src, "r", encoding="utf-8", newline='') as f:
|
||||||
reader = list(csv.reader(f))
|
reader = list(csv.reader(f))
|
||||||
new_rows = []
|
new_rows = []
|
||||||
for row in reader:
|
for row in reader:
|
||||||
new_row = [replace_emojis(cell, emoji_map) for cell in row]
|
new_row = [replace_duplicate_emojis(cell, emoji_map) for cell in row]
|
||||||
new_rows.append(new_row)
|
new_rows.append(new_row)
|
||||||
with open(dst, "w", encoding="utf-8", newline='') as f:
|
with open(dst, "w", encoding="utf-8", newline='') as f:
|
||||||
writer = csv.writer(f)
|
writer = csv.writer(f)
|
||||||
writer.writerows(new_rows)
|
writer.writerows(new_rows)
|
||||||
|
|
||||||
def process_txt_file(src: str, dst: str, emoji_map: dict):
|
def process_txt_file(src, dst, emoji_map):
|
||||||
text = load_file(src)
|
with open(src, "r", encoding="utf-8") as f:
|
||||||
new_text = replace_emojis(text, emoji_map)
|
text = f.read()
|
||||||
save_file(dst, new_text)
|
new_text = replace_duplicate_emojis(text, emoji_map)
|
||||||
|
with open(dst, "w", encoding="utf-8") as f:
|
||||||
|
f.write(new_text)
|
||||||
|
|
||||||
def process_md_file(src: str, dst: str, emoji_map: dict):
|
def process_md_file(src, dst, emoji_map):
|
||||||
process_txt_file(src, dst, emoji_map)
|
process_txt_file(src, dst, emoji_map)
|
||||||
|
|
||||||
def process_xml_file(src: str, dst: str, emoji_map: dict):
|
def process_xml_file(src, dst, emoji_map):
|
||||||
tree = ET.parse(src)
|
tree = ET.parse(src)
|
||||||
root = tree.getroot()
|
root = tree.getroot()
|
||||||
def recursive_xml(elem):
|
def recursive_xml(elem):
|
||||||
if elem.text:
|
if elem.text:
|
||||||
elem.text = replace_emojis(elem.text, emoji_map)
|
elem.text = replace_duplicate_emojis(elem.text, emoji_map)
|
||||||
if elem.tail:
|
if elem.tail:
|
||||||
elem.tail = replace_emojis(elem.tail, emoji_map)
|
elem.tail = replace_duplicate_emojis(elem.tail, emoji_map)
|
||||||
for child in elem:
|
for child in elem:
|
||||||
recursive_xml(child)
|
recursive_xml(child)
|
||||||
recursive_xml(root)
|
recursive_xml(root)
|
||||||
|
@ -156,7 +156,7 @@ class MyHTMLParser(HTMLParser):
|
||||||
def handle_endtag(self, tag):
|
def handle_endtag(self, tag):
|
||||||
self.result.append(f"</{tag}>")
|
self.result.append(f"</{tag}>")
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
self.result.append(replace_emojis(data, self.emoji_map))
|
self.result.append(replace_duplicate_emojis(data, self.emoji_map))
|
||||||
def handle_entityref(self, name):
|
def handle_entityref(self, name):
|
||||||
self.result.append(f"&{name};")
|
self.result.append(f"&{name};")
|
||||||
def handle_charref(self, name):
|
def handle_charref(self, name):
|
||||||
|
@ -164,11 +164,13 @@ class MyHTMLParser(HTMLParser):
|
||||||
def get_html(self):
|
def get_html(self):
|
||||||
return "".join(self.result)
|
return "".join(self.result)
|
||||||
|
|
||||||
def process_html_file(src: str, dst: str, emoji_map: dict):
|
def process_html_file(src, dst, emoji_map):
|
||||||
text = load_file(src)
|
with open(src, "r", encoding="utf-8") as f:
|
||||||
|
text = f.read()
|
||||||
parser = MyHTMLParser(emoji_map)
|
parser = MyHTMLParser(emoji_map)
|
||||||
parser.feed(text)
|
parser.feed(text)
|
||||||
save_file(dst, parser.get_html())
|
with open(dst, "w", encoding="utf-8") as f:
|
||||||
|
f.write(parser.get_html())
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
src_dir = os.path.join(os.path.dirname(__file__))
|
src_dir = os.path.join(os.path.dirname(__file__))
|
||||||
|
@ -182,13 +184,13 @@ def main():
|
||||||
src_path = os.path.join(src_dir, filename)
|
src_path = os.path.join(src_dir, filename)
|
||||||
dst_path = os.path.join(output_dir, filename)
|
dst_path = os.path.join(output_dir, filename)
|
||||||
print(f"Processing {filename}")
|
print(f"Processing {filename}")
|
||||||
# Extract all emojis in file, in order
|
with open(src_path, "r", encoding="utf-8") as f:
|
||||||
text = load_file(src_path)
|
text = f.read()
|
||||||
original_emojis = get_emojis_in_order(text)
|
emoji_list = extract_emojis(text)
|
||||||
if not original_emojis:
|
if not emoji_list:
|
||||||
print(f" No emojis found in {filename}, skip.")
|
print(f" No emojis found in {filename}, skip.")
|
||||||
continue
|
continue
|
||||||
emoji_map = build_emoji_map(original_emojis, ANDROID_EMOJI_POOL)
|
emoji_map = build_emoji_replace_map(emoji_list, EMOJI_POOL)
|
||||||
ext = os.path.splitext(filename)[-1].lower()
|
ext = os.path.splitext(filename)[-1].lower()
|
||||||
if ext == ".json":
|
if ext == ".json":
|
||||||
process_json_file(src_path, dst_path, emoji_map)
|
process_json_file(src_path, dst_path, emoji_map)
|
||||||
|
|
Loading…
Reference in New Issue