Merge pull request #5 from hiepph/text-emojize-example
Browse files
Add an example: output emoji visualization from a single text input
- examples/README.md +32 -24
- examples/text_emojize.py +63 -0
examples/README.md
CHANGED
|
@@ -1,31 +1,39 @@
|
|
| 1 |
# torchMoji examples
|
| 2 |
|
| 3 |
-
## Initialization
|
| 4 |
-
[create_twitter_vocab.py](create_twitter_vocab.py)
|
| 5 |
-
Create a new vocabulary from a tsv file.
|
| 6 |
-
|
| 7 |
-
[tokenize_dataset.py](tokenize_dataset.py)
|
| 8 |
-
Tokenize a given dataset using the prebuilt vocabulary.
|
| 9 |
-
|
| 10 |
-
[vocab_extension.py](vocab_extension.py)
|
| 11 |
-
Extend the given vocabulary using dataset-specific words.
|
| 12 |
-
|
| 13 |
-
[dataset_split.py](dataset_split.py)
|
| 14 |
Split a given dataset into training, validation and testing.
|
| 15 |
-
|
| 16 |
-
## Use pretrained model/architecture
|
| 17 |
-
[score_texts_emojis.py](score_texts_emojis.py)
|
| 18 |
-
Use torchMoji to score texts for emoji distribution.
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
Use torchMoji to encode the text into 2304-dimensional feature vectors for further modeling/analysis.
|
| 22 |
|
| 23 |
## Transfer learning
|
| 24 |
-
[finetune_youtube_last.py](finetune_youtube_last.py)
|
| 25 |
-
Finetune the model on the SS-Youtube dataset using the 'last' method.
|
| 26 |
-
|
| 27 |
-
[finetune_insults_chain-thaw.py](finetune_insults_chain-thaw.py)
|
| 28 |
-
Finetune the model on the Kaggle insults dataset (from blog post) using the 'chain-thaw' method.
|
| 29 |
-
|
| 30 |
-
[finetune_semeval_class-avg_f1.py](finetune_semeval_class-avg_f1.py)
|
| 31 |
-
Finetune the model on the SemEval emotion dataset using the 'full' method and evaluate using the class average F1 metric.
|
|
|
|
| 1 |
# torchMoji examples
|
| 2 |
|
| 3 |
+
## Initialization
|
| 4 |
+
[create_twitter_vocab.py](create_twitter_vocab.py)
|
| 5 |
+
Create a new vocabulary from a tsv file.
|
| 6 |
+
|
| 7 |
+
[tokenize_dataset.py](tokenize_dataset.py)
|
| 8 |
+
Tokenize a given dataset using the prebuilt vocabulary.
|
| 9 |
+
|
| 10 |
+
[vocab_extension.py](vocab_extension.py)
|
| 11 |
+
Extend the given vocabulary using dataset-specific words.
|
| 12 |
+
|
| 13 |
+
[dataset_split.py](dataset_split.py)
|
| 14 |
Split a given dataset into training, validation and testing.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
## Use pretrained model/architecture
|
| 17 |
+
[score_texts_emojis.py](score_texts_emojis.py)
|
| 18 |
+
Use torchMoji to score texts for emoji distribution.
|
| 19 |
+
|
| 20 |
+
[text_emojize.py](text_emojize.py)
|
| 21 |
+
Use torchMoji to print the top predicted emojis after a single text input (using the emoji mapping from `emoji_overview.png`).
|
| 22 |
+
|
| 23 |
+
```sh
|
| 24 |
+
python examples/text_emojize.py --text "I love mom's cooking\!"
|
| 25 |
+
# => I love mom's cooking! 😋 😍 💓 💛 ❤
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
[encode_texts.py](encode_texts.py)
|
| 29 |
Use torchMoji to encode the text into 2304-dimensional feature vectors for further modeling/analysis.
|
| 30 |
|
| 31 |
## Transfer learning
|
| 32 |
+
[finetune_youtube_last.py](finetune_youtube_last.py)
|
| 33 |
+
Finetune the model on the SS-Youtube dataset using the 'last' method.
|
| 34 |
+
|
| 35 |
+
[finetune_insults_chain-thaw.py](finetune_insults_chain-thaw.py)
|
| 36 |
+
Finetune the model on the Kaggle insults dataset (from blog post) using the 'chain-thaw' method.
|
| 37 |
+
|
| 38 |
+
[finetune_semeval_class-avg_f1.py](finetune_semeval_class-avg_f1.py)
|
| 39 |
+
Finetune the model on the SemEval emotion dataset using the 'full' method and evaluate using the class average F1 metric.
|
examples/text_emojize.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
""" Use torchMoji to predict emojis from a single text input
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import print_function, division, unicode_literals
|
| 7 |
+
import example_helper
|
| 8 |
+
import json
|
| 9 |
+
import csv
|
| 10 |
+
import argparse
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
import emoji
|
| 14 |
+
|
| 15 |
+
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
| 16 |
+
from torchmoji.model_def import torchmoji_emojis
|
| 17 |
+
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
|
| 18 |
+
|
| 19 |
+
# Emoji map in emoji_overview.png
|
| 20 |
+
# Emoji aliases, one per prediction index: the script looks up EMOJIS[i] for
# each index i it selects from the model output. Adjacent string literals are
# concatenated by the parser, so every row except the last ends with a single
# space to keep the aliases separated before the split.
EMOJIS = (
    ":joy: :unamused: :weary: :sob: :heart_eyes: "
    ":pensive: :ok_hand: :blush: :heart: :smirk: "
    ":grin: :notes: :flushed: :100: :sleeping: "
    ":relieved: :relaxed: :raised_hands: :two_hearts: :expressionless: "
    ":sweat_smile: :pray: :confused: :kissing_heart: :heartbeat: "
    ":neutral_face: :information_desk_person: :disappointed: :see_no_evil: :tired_face: "
    ":v: :sunglasses: :rage: :thumbsup: :cry: "
    ":sleepy: :yum: :triumph: :hand: :mask: "
    ":clap: :eyes: :gun: :persevere: :smiling_imp: "
    ":sweat: :broken_heart: :yellow_heart: :musical_note: :speak_no_evil: "
    ":wink: :skull: :confounded: :smile: :stuck_out_tongue_winking_eye: "
    ":angry: :no_good: :muscle: :facepunch: :purple_heart: "
    ":sparkling_heart: :blue_heart: :grimacing: :sparkles:"
).split(' ')
|
| 33 |
+
|
| 34 |
+
def top_elements(array, k):
    """Return the indices of the `k` largest values in `array`, largest first.

    Args:
        array: 1-D NumPy array of comparable values.
        k: Number of indices to return. Values larger than ``len(array)``
            are clamped to ``len(array)``; values ``<= 0`` yield an empty
            result.

    Returns:
        1-D integer array of indices into `array`, ordered by descending
        value.
    """
    # Clamp so np.argpartition never receives an out-of-range kth.
    k = min(k, len(array))
    if k <= 0:
        # Without this guard, k == 0 would slice with [-0:], which selects
        # the whole array instead of nothing.
        return np.array([], dtype=np.intp)
    # Partial selection of the k largest (unordered), then sort just those.
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]
|
| 37 |
+
|
| 38 |
+
if __name__ == "__main__":
    # Command line: --text is the sentence to annotate, --maxlen is handed to
    # the SentenceTokenizer as its length argument.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
    argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
    args = argparser.parse_args()

    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, args.maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions on a one-sentence batch
    tokenized, _, _ = st.tokenize_sentences([args.text])

    # Take the output for the first (only) sentence.
    # NOTE(review): presumably one score per entry of EMOJIS - confirm
    # against torchmoji_emojis.
    prob = model(tokenized)[0]

    # Indices of the five highest-scoring emojis, best first
    emoji_ids = top_elements(prob, 5)

    # Map prediction indices to emoji aliases
    emojis = [EMOJIS[i] for i in emoji_ids]

    message = "{} {}".format(args.text, ' '.join(emojis))
    try:
        rendered = emoji.emojize(message, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 removed the `use_aliases` keyword; alias names are
        # selected through `language='alias'` instead.
        rendered = emoji.emojize(message, language='alias')
    print(rendered)
|