BluechipTechnologiesAsia committed
Commit 3d65a6d · verified · 1 Parent(s): d397c6e

Upload 6 files

Files changed (6)
  1. Dockerfile +18 -0
  2. app.py +48 -0
  3. functions.py +50 -0
  4. requirements.txt +9 -0
  5. templates/index.html +17 -0
  6. templates/result.html +14 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
+ FROM python:3.9
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user . $HOME/app
+
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
app.py ADDED
@@ -0,0 +1,48 @@
+ from flask import Flask, render_template, request
+ from PIL import Image
+ from io import BytesIO
+ import base64
+ import os
+ from functions import generate_image_caption, convert_text_to_speech
+ import soundfile as sf
+ from transformers import AutoTokenizer
+ from parler_tts import ParlerTTSForConditionalGeneration
+
+ app = Flask(__name__)
+
+ UPLOAD_FOLDER = 'static/uploads'
+ app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ device = "cpu"
+ model_name = "parler-tts/parler_tts_mini_v0.1"
+
+ # Loaded once at startup; app.py only needs the config for the output sampling rate.
+ model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ @app.route('/')
+ def index():
+     return render_template('index.html')
+
+
+ @app.route('/', methods=['POST'])
+ def upload_image():
+     uploaded_file = request.files['image']
+     if uploaded_file.filename != '':
+         image_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
+         uploaded_file.save(image_path)
+
+         # Caption the image, then synthesise the caption as speech.
+         caption = generate_image_caption(image_path)
+
+         audio_arr = convert_text_to_speech(caption)
+         sf.write("static/audio.wav", audio_arr, model.config.sampling_rate)
+
+         # Inline the uploaded image in the result page as a base64-encoded PNG.
+         img = Image.open(image_path)
+         img_io = BytesIO()
+         img.save(img_io, 'PNG')
+         img_encoded = base64.b64encode(img_io.getvalue()).decode('utf-8')
+
+         return render_template('result.html', caption=caption, audio_path="audio.wav", image=img_encoded)
+     else:
+         return render_template('index.html', message="No image uploaded!")
+
+ if __name__ == '__main__':
+     app.run(debug=True)
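
For a quick local check, the two helpers used above can also be driven without Flask. The sketch below is not part of this commit: "example.jpg" is a placeholder filename, and the 44100 Hz write rate is assumed for the Parler-TTS mini checkpoint rather than read from model.config.sampling_rate as app.py does.

    # Standalone sketch (assumptions noted above), not part of the uploaded files.
    import soundfile as sf
    from functions import generate_image_caption, convert_text_to_speech

    caption = generate_image_caption("example.jpg")   # placeholder image path
    print(caption)

    audio_arr = convert_text_to_speech(caption)
    sf.write("caption.wav", audio_arr, 44100)         # assumed sampling rate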
functions.py ADDED
@@ -0,0 +1,50 @@
+ from PIL import Image
+ from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer
+ from parler_tts import ParlerTTSForConditionalGeneration
+
+
+ def generate_image_caption(image_path):
+     # Florence-2 ships custom modelling code, so trust_remote_code=True is required.
+     model_name = "microsoft/Florence-2-large"
+     prompt = "<CAPTION>"
+
+     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+     processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+     image = Image.open(image_path).convert("RGB")
+
+     inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3,
+         do_sample=False
+     )
+
+     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+     # post_process_generation returns a dict keyed by the task token; pull out the caption string.
+     parsed = processor.post_process_generation(generated_text, task="<CAPTION>", image_size=(image.width, image.height))
+
+     return parsed["<CAPTION>"]
+
+
+ def convert_text_to_speech(text, device="cpu"):
+     # Parler-TTS conditions the voice on a free-text description and the spoken content on the prompt.
+     model_name = "parler-tts/parler_tts_mini_v0.1"
+
+     model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+
+     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+     prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+     generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+     audio_arr = generation.cpu().numpy().squeeze()
+
+     return audio_arr
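
One side effect of the layout above is that both helpers reload and re-initialise their model weights on every call, which dominates request latency in app.py. A minimal sketch of caching the Florence-2 pipeline at first use instead (the _florence name is illustrative, not part of this commit):

    # Sketch only: load the checkpoint once and reuse it across requests.
    from functools import lru_cache
    from transformers import AutoModelForCausalLM, AutoProcessor

    @lru_cache(maxsize=1)
    def _florence():
        name = "microsoft/Florence-2-large"
        model = AutoModelForCausalLM.from_pretrained(name, trust_remote_code=True)
        processor = AutoProcessor.from_pretrained(name, trust_remote_code=True)
        return model, processor

The same pattern applies to the Parler-TTS model, which app.py already keeps resident at module level.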
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ Flask
+ Flask-WTF
+ Pillow
+ soundfile
+ transformers
+ torch
+ gunicorn
+ git+https://github.com/huggingface/parler-tts.git
templates/index.html ADDED
@@ -0,0 +1,17 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Image Audio Description</title>
+ </head>
+ <body>
+     <h1>Upload an Image</h1>
+     <form method="POST" enctype="multipart/form-data">
+         <input type="file" name="image" accept="image/*">
+         <br>
+         <input type="submit" value="Generate audio description">
+     </form>
+     {% if message %}
+     <p style="color: red;">{{ message }}</p>
+     {% endif %}
+ </body>
+ </html>
templates/result.html ADDED
@@ -0,0 +1,14 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Image Audio Description</title>
+ </head>
+ <body>
+     <h1>Image Caption</h1>
+     <p>{{ caption }}</p>
+     <img src="data:image/png;base64,{{ image }}" alt="Uploaded Image"> <br>
+     <audio controls>
+         <source src="{{ url_for('static', filename=audio_path) }}" type="audio/wav">
+     </audio>
+ </body>
+ </html>