BluechipTechnologiesAsia committed
Commit 3d65a6d · verified · 1 Parent(s): d397c6e

Upload 6 files

Files changed (6)
  1. Dockerfile +18 -0
  2. app.py +48 -0
  3. functions.py +50 -0
  4. requirements.txt +9 -0
  5. templates/index.html +17 -0
  6. templates/result.html +14 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
+ FROM python:3.9
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user . $HOME/app
+
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
app.py ADDED
@@ -0,0 +1,48 @@
+ from flask import Flask, render_template, request
+ from PIL import Image
+ from io import BytesIO
+ import base64
+ import os
+ from functions import generate_image_caption, convert_text_to_speech
+ import soundfile as sf
+ from transformers import AutoTokenizer
+ from parler_tts import ParlerTTSForConditionalGeneration
+
+ app = Flask(__name__)
+
+ UPLOAD_FOLDER = 'static/uploads'
+ app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ device = "cpu"
+ model_name = "parler-tts/parler_tts_mini_v0.1"
+
+ # Loaded once at startup; app.py only needs the config for the output sampling rate.
+ model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ @app.route('/')
+ def index():
+     return render_template('index.html')
+
+
+ @app.route('/', methods=['POST'])
+ def upload_image():
+     uploaded_file = request.files['image']
+     if uploaded_file.filename != '':
+         image_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
+         uploaded_file.save(image_path)
+
+         # Caption the image, then synthesise the caption as speech.
+         caption = generate_image_caption(image_path)
+
+         audio_arr = convert_text_to_speech(caption)
+         sf.write("static/audio.wav", audio_arr, model.config.sampling_rate)
+
+         # Inline the uploaded image in the result page as a base64-encoded PNG.
+         img = Image.open(image_path)
+         img_io = BytesIO()
+         img.save(img_io, 'PNG')
+         img_encoded = base64.b64encode(img_io.getvalue()).decode('utf-8')
+
+         return render_template('result.html', caption=caption, audio_path="audio.wav", image=img_encoded)
+     else:
+         return render_template('index.html', message="No image uploaded!")
+
+ if __name__ == '__main__':
+     app.run(debug=True)
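
For a quick local check, the two helpers used above can also be driven without Flask. The sketch below is not part of this commit: "example.jpg" is a placeholder filename, and the 44100 Hz write rate is assumed for the Parler-TTS mini checkpoint rather than read from model.config.sampling_rate as app.py does.

    # Standalone sketch (assumptions noted above), not part of the uploaded files.
    import soundfile as sf
    from functions import generate_image_caption, convert_text_to_speech

    caption = generate_image_caption("example.jpg")   # placeholder image path
    print(caption)

    audio_arr = convert_text_to_speech(caption)
    sf.write("caption.wav", audio_arr, 44100)         # assumed sampling rate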
functions.py ADDED
@@ -0,0 +1,50 @@
+ from PIL import Image
+ from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer
+ from parler_tts import ParlerTTSForConditionalGeneration
+
+
+ def generate_image_caption(image_path):
+     # Florence-2 ships custom modelling code, so trust_remote_code=True is required.
+     model_name = "microsoft/Florence-2-large"
+     prompt = "<CAPTION>"
+
+     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+     processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+     image = Image.open(image_path).convert("RGB")
+
+     inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3,
+         do_sample=False
+     )
+
+     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+     # post_process_generation returns a dict keyed by the task token; pull out the caption string.
+     parsed = processor.post_process_generation(generated_text, task="<CAPTION>", image_size=(image.width, image.height))
+
+     return parsed["<CAPTION>"]
+
+
+ def convert_text_to_speech(text, device="cpu"):
+     # Parler-TTS conditions the voice on a free-text description and the spoken content on the prompt.
+     model_name = "parler-tts/parler_tts_mini_v0.1"
+
+     model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+
+     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+     prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+     generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+     audio_arr = generation.cpu().numpy().squeeze()
+
+     return audio_arr
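
One side effect of the layout above is that both helpers reload and re-initialise their model weights on every call, which dominates request latency in app.py. A minimal sketch of caching the Florence-2 pipeline at first use instead (the _florence name is illustrative, not part of this commit):

    # Sketch only: load the checkpoint once and reuse it across requests.
    from functools import lru_cache
    from transformers import AutoModelForCausalLM, AutoProcessor

    @lru_cache(maxsize=1)
    def _florence():
        name = "microsoft/Florence-2-large"
        model = AutoModelForCausalLM.from_pretrained(name, trust_remote_code=True)
        processor = AutoProcessor.from_pretrained(name, trust_remote_code=True)
        return model, processor

The same pattern applies to the Parler-TTS model, which app.py already keeps resident at module level.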
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ Flask
+ Flask-WTF
+ Pillow
+ soundfile
+ transformers
+ torch
+ gunicorn
+ git+https://github.com/huggingface/parler-tts.git
templates/index.html ADDED
@@ -0,0 +1,17 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Image Audio Description</title>
+ </head>
+ <body>
+     <h1>Upload an Image</h1>
+     <form method="POST" enctype="multipart/form-data">
+         <input type="file" name="image" accept="image/*">
+         <br>
+         <input type="submit" value="Generate audio description">
+     </form>
+     {% if message %}
+     <p style="color: red;">{{ message }}</p>
+     {% endif %}
+ </body>
+ </html>
templates/result.html ADDED
@@ -0,0 +1,14 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Image Audio Description</title>
+ </head>
+ <body>
+     <h1>Image Caption</h1>
+     <p>{{ caption }}</p>
+     <img src="data:image/png;base64,{{ image }}" alt="Uploaded Image"> <br>
+     <audio controls>
+         <source src="{{ url_for('static', filename=audio_path) }}" type="audio/wav">
+     </audio>
+ </body>
+ </html>