- Use case: helping a visually impaired person identify the objects in a picture
pip install transformers
pip install gradio
pip install timm
pip install inflect
pip install phonemizer
sudo apt-get update
sudo apt-get install espeak-ng
pip install py-espeak-ng
from helper import load_image_from_url, render_results_in_image
from transformers import pipeline
# Build the object-detection pipeline from the locally stored DETR
# (ResNet-50 backbone) checkpoint.
od_pipe = pipeline(
    task="object-detection",
    model="./models/facebook/detr-resnet-50",
)
Opening the image and using the pipeline
from PIL import Image
# Load the picture to analyse and scale it to a fixed size.
raw_image = Image.open('path_to_image..')
# Image.resize() returns a new image instead of modifying the original, so the
# result must be assigned back; otherwise the resize is silently discarded.
raw_image = raw_image.resize((569, 491))
# raw output
pipeline_output = od_pipe(raw_image)
# processing and rendering the raw output
processed_image = render_results_in_image(
    raw_image,
    pipeline_output
)
print(processed_image)
# NOTE(review): print() only shows the object's text representation. Assuming
# render_results_in_image returns a PIL image (confirm in helper), evaluate
# `processed_image` as the last expression of a notebook cell to see the
# picture with the detected objects drawn on it.
Using Gradio
import os
import gradio as gr
def get_pipeline_prediction(pil_image):
    """Run object detection on an image and return the rendered result.

    The image is passed through the module-level ``od_pipe`` pipeline, and
    the image together with the raw predictions is handed to
    ``render_results_in_image``.

    Args:
        pil_image: The image to analyse. The Gradio interface in this file
            supplies it with ``type="pil"``.

    Returns:
        Whatever ``render_results_in_image`` returns for the image and its
        predictions (used in this file as a Gradio ``type="pil"`` output).
    """
    pipeline_output = od_pipe(pil_image)
    return render_results_in_image(pil_image, pipeline_output)
# making gradio demo
demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs=gr.Image(
        label="Output image with predicted instances",
        type="pil",
    ),
)
# Use the port given by the PORT1 environment variable when it is set.
# Outside environments that define it, fall back to None so Gradio chooses
# its default port instead of the script failing with a KeyError.
_port = os.environ.get('PORT1')
# `share=True` will provide an online link to access the demo
demo.launch(
    share=True,
    server_port=int(_port) if _port else None,
)
# close after accessing the link to the demo
demo.close()
AI powered Audio assistant
- Combine the object detector with a text-to-speech model that narrates (reads aloud) what is inside the image
from helper import summarize_predictions_natural_language
# Load the picture that will be described aloud.
raw_image = Image.open(
    'huggingface_friends.jpg'
)
# Image.resize() returns a new image, so keep the result; calling it without
# assignment leaves raw_image at its original size.
raw_image = raw_image.resize(
    (284, 245)
)
# Run detection on this image so the summary describes it, rather than
# whatever image `pipeline_output` was last computed for.
pipeline_output = od_pipe(raw_image)
# Turn the raw detections into a single natural-language sentence.
text = summarize_predictions_natural_language(
    pipeline_output
)
print(text)
"""
'In this image, there are two forks three bottles two cups four persons one bowl and one dining table.'
"""
Generating Audio Narration
# Build the text-to-speech pipeline from the locally stored VITS checkpoint.
tts_pipe = pipeline(
    task="text-to-speech",
    model="./models/kakao-enterprise/vits-ljs",
)
# Synthesize speech for the summary sentence.
narrated_text = tts_pipe(text)
# playing the generated audio
from IPython.display import Audio as IPythonAudio
waveform = narrated_text["audio"][0]
sample_rate = narrated_text["sampling_rate"]
IPythonAudio(waveform, rate=sample_rate)