from flask import Flask, render_template, request
from PIL import Image
from io import BytesIO
import base64
import torch
import os
from functions import generate_image_caption, convert_text_to_speech
import soundfile as sf
from transformers import AutoTokenizer
from parler_tts import ParlerTTSForConditionalGeneration
import subprocess

# Install flash-attn at runtime, skipping the CUDA kernel build during installation.
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
app = Flask(__name__)
UPLOAD_FOLDER = 'static/uploads'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
os.makedirs(UPLOAD_FOLDER, exist_ok=True)  # make sure the upload directory exists before saving files
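
# Load the Parler-TTS model and tokenizer once at startup so every request reuses them.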
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = "parler-tts/parler_tts_mini_v0.1"
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@app.route('/')
def index():
    return render_template('index.html')
@app.route('/', methods=['POST'])
def upload_image():
    uploaded_file = request.files['image']
    if uploaded_file.filename != '':
        image_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
        uploaded_file.save(image_path)

        # Caption the uploaded image, then synthesize the caption as speech.
        caption = generate_image_caption(image_path)
        audio_arr = convert_text_to_speech(caption)
        sf.write("static/audio.wav", audio_arr, model.config.sampling_rate)

        # Re-encode the image as a base64 PNG string so the template can embed it inline.
        img = Image.open(image_path)
        img_io = BytesIO()
        img.save(img_io, 'PNG')
        img_encoded = base64.b64encode(img_io.getvalue()).decode('utf-8')

        return render_template('result.html', caption=caption, audio_path="audio.wav", image=img_encoded)
    else:
        return render_template('index.html', message="No image uploaded!")
if __name__ == '__main__':
    app.run(debug=True)