import sys import time from importlib.metadata import version from gradio.utils import is_zero_gpu_space from gradio.themes import Soft try: import spaces except ImportError: print("ZeroGPU is not available, skipping...") import torch import torchaudio import gradio as gr import torchaudio.transforms as T from transformers import ( AutoModelForCausalLM, AutoTokenizer, AutoProcessor, MoonshineForConditionalGeneration, ) use_zero_gpu = is_zero_gpu_space() use_cuda = torch.cuda.is_available() if use_zero_gpu: spaces_version = version("spaces") print("ZeroGPU is available, changing inference call.") else: spaces_version = "N/A" print("ZeroGPU is not available, skipping...") print(f"Spaces version: {spaces_version}") if use_cuda: print("CUDA is available, setting correct `device` variable.") device = "cuda" torch_dtype = torch.bfloat16 else: device = "cpu" torch_dtype = torch.bfloat16 # Config model_name = "Yehor/kulyk-en-uk" concurrency_limit = 5 # Load the model model = AutoModelForCausalLM.from_pretrained( model_name, device_map=device, torch_dtype=torch_dtype, trust_remote_code=True, ) model.eval() tokenizer = AutoTokenizer.from_pretrained(model_name) # Load ASR audio_processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-base") audio_model = MoonshineForConditionalGeneration.from_pretrained( "UsefulSensors/moonshine-base", attn_implementation="sdpa" ) audio_model.to(device) audio_model.to(torch_dtype) # Examples examples_text = [ "WP: F-16s are unlikely to make a significant difference on the battlefield", "Missile and 7 of 8 Shaheeds shot down over Ukraine", "Olympic Games 2024. Schedule of competitions for Ukrainian athletes on 28 July", "Harris' campaign raised more than $200 million in less than a week", "Over the week, the NBU sold almost $800 million on the interbank market", "Paris 2024. Day 2: Text broadcast", ] examples_audio = [ "example_1.wav", "example_2.wav", "example_3.wav", "example_4.wav", "example_5.wav", "example_6.wav", "example_7.wav", ] title = "EN-UK Translator" authors_table = """ ## Authors Follow them on social networks and **contact** if you need any help or have any questions: | **Yehor Smoliakov** | |-------------------------------------------------------------------------------------------------| | https://t.me/smlkw in Telegram | | https://x.com/yehor_smoliakov at X | | https://github.com/egorsmkv at GitHub | | https://huggingface.co/Yehor at Hugging Face | | or use egorsmkv@gmail.com | """.strip() description_head = f""" # {title} This space translates your text from English to Ukrainian. Use another spaces: [from Audio](https://huggingface.co/spaces/Yehor/audio-en-uk-translator), [from Images](https://huggingface.co/spaces/Yehor/vision-en-uk-translator). Also, check [UK-EN Translator](https://huggingface.co/spaces/Yehor/uk-en-translator) out for reverse task. """.strip() translated_text_value = """ Translated text will appear here. Choose an example below the Translate button or type your text. """.strip() tech_env = f""" #### Environment - Python: {sys.version} #### Models - [kulyk-en-uk](https://huggingface.co/Yehor/kulyk-en-uk) """.strip() tech_libraries = f""" #### Libraries - torch: {version("torch")} - gradio: {version("gradio")} - transformers: {version("transformers")} """.strip() @spaces.GPU def inference_text(text, progress=gr.Progress()): if not text: raise gr.Error("Please paste your text.") progress(0, desc="Translating...") results = [] sentences = text.split("\n") non_empty_sentences = [] for sentence in sentences: s = sentence.strip() if len(s) != 0: non_empty_sentences.append(s) for sentence in progress.tqdm( non_empty_sentences, desc="Translating...", unit="sentence" ): t0 = time.time() prompt = "Translate the text to Ukrainian:\n" + sentence input_ids = tokenizer.apply_chat_template( [{"role": "user", "content": prompt}], add_generation_prompt=True, return_tensors="pt", tokenize=True, ).to(model.device) output = model.generate( input_ids, max_new_tokens=2048, # Greedy Search do_sample=False, repetition_penalty=1.05, # Sampling # do_sample=True, # temperature=0.1, # # top_k=1, # min_p=0.9, # repetition_penalty=1.05, ) prompt_len = input_ids.shape[1] generated_tokens = output[:, prompt_len:] translated_text = tokenizer.batch_decode( generated_tokens, skip_special_tokens=True )[0] elapsed_time = round(time.time() - t0, 2) translated_text = translated_text.strip() results.append( { "sentence": sentence, "translated_text": translated_text, "elapsed_time": elapsed_time, } ) gr.Info("Finished!", duration=2) result_texts = [] for result in results: result_texts.append(f"{result['translated_text']}\n") sum_elapsed_text = sum([result["elapsed_time"] for result in results]) print(f"Elapsed time: {round(sum_elapsed_text, 4)} seconds") return "\n".join(result_texts) @spaces.GPU def inference_audio(audio, progress=gr.Progress()): if not audio: raise gr.Error("Please paste your audio file.") progress(0, desc="Translating...") if isinstance(audio, str): audio_array, sr = torchaudio.load(audio) audio_array = audio_array.squeeze() else: audio_array, sr = audio r_sr = audio_processor.feature_extractor.sampling_rate print("Audio processor SR:", r_sr) print("Audio file SR:", sr) if r_sr != sr: print("Resampling...") resampler = T.Resample(orig_freq=sr, new_freq=r_sr) audio_array = resampler(audio_array) inputs = audio_processor(audio_array, return_tensors="pt", sampling_rate=r_sr) inputs = inputs.to(device, dtype=torch_dtype) # to avoid hallucination loops, we limit the maximum length of the generated text based expected number of tokens per second token_limit_factor = ( 6.5 / audio_processor.feature_extractor.sampling_rate ) # Maximum of 6.5 tokens per second seq_lens = inputs.attention_mask.sum(dim=-1) max_length = int((seq_lens * token_limit_factor).max().item()) generated_ids = audio_model.generate(**inputs, max_length=max_length) predictions = audio_processor.batch_decode(generated_ids, skip_special_tokens=True) print("Predictions:", predictions) text = predictions[0] print("Text:", text) results = [] sentences = text.split("\n") non_empty_sentences = [] for sentence in sentences: s = sentence.strip() if len(s) != 0: non_empty_sentences.append(s) for sentence in progress.tqdm( non_empty_sentences, desc="Translating...", unit="sentence" ): t0 = time.time() prompt = "Translate the text to Ukrainian:\n" + sentence input_ids = tokenizer.apply_chat_template( [{"role": "user", "content": prompt}], add_generation_prompt=True, return_tensors="pt", tokenize=True, ).to(model.device) output = model.generate( input_ids, max_new_tokens=2048, # Greedy Search do_sample=False, repetition_penalty=1.05, # Sampling # do_sample=True, # temperature=0.1, # # top_k=1, # min_p=0.9, # repetition_penalty=1.05, ) prompt_len = input_ids.shape[1] generated_tokens = output[:, prompt_len:] translated_text = tokenizer.batch_decode( generated_tokens, skip_special_tokens=True )[0] elapsed_time = round(time.time() - t0, 2) translated_text = translated_text.strip() results.append( { "sentence": sentence, "translated_text": translated_text, "elapsed_time": elapsed_time, } ) gr.Info("Finished!", duration=2) result_texts = [] for result in results: result_texts.append(f"> {result['sentence']}") result_texts.append(f"{result['translated_text']}\n") sum_elapsed_text = sum([result["elapsed_time"] for result in results]) result_texts.append(f"Elapsed time: {round(sum_elapsed_text, 4)} seconds") return "\n".join(result_texts) def create_app(): tab = gr.Blocks( title=title, analytics_enabled=False, theme=Soft(), ) with tab: gr.Markdown(description_head) gr.Markdown("## Usage") translated_text = gr.Textbox( label="Translated text", placeholder=translated_text_value, show_copy_button=True, ) text = gr.Textbox(label="Text", autofocus=True, lines=5) gr.Button("Translate").click( inference_text, concurrency_limit=concurrency_limit, inputs=text, outputs=translated_text, ) with gr.Row(): gr.Examples(label="Choose an example", inputs=text, examples=examples_text) return tab def create_audio_app(): with gr.Blocks(theme=Soft()) as tab: gr.Markdown(description_head) gr.Markdown("## Usage") translated_text = gr.Textbox( label="Translated text", placeholder=translated_text_value, show_copy_button=True, ) audio = gr.Audio(label="Audio file", sources="upload", type="filepath") gr.Button("Translate").click( inference_audio, concurrency_limit=concurrency_limit, inputs=audio, outputs=translated_text, ) with gr.Row(): gr.Examples(label="Choose an example", inputs=audio, examples=examples_text) return tab def create_env(): with gr.Blocks(theme=Soft()) as tab: gr.Markdown(tech_env) gr.Markdown(tech_libraries) return tab def create_authors(): with gr.Blocks(theme=Soft()) as tab: gr.Markdown(authors_table) return tab def create_demo(): app_tab = create_app() app_audio_tab = create_audio_app() authors_tab = create_authors() env_tab = create_env() return gr.TabbedInterface( [app_tab, app_audio_tab, authors_tab, env_tab], tab_names=[ "✍️ Translation", "🔊 Audio", "👥 Authors", "📦 Environment, Models, and Libraries", ], ) if __name__ == "__main__": demo = create_demo() demo.queue() demo.launch()