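"""Image sonification demo.

Treats an uploaded image as a mel spectrogram, inverts the mel scale back to
a linear spectrogram, reconstructs a waveform with Griffin-Lim, and serves
the whole pipeline through a Gradio interface.
"""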
import gradio as gr
import librosa
import numpy as np
import torch
import torchaudio
from PIL import Image


def img_to_audio(image, time=3.0, rate=22050, n_fft=1024, n_iter=64):
    # Load the image and convert to grayscale
    img = Image.fromarray(image).convert("L")
    # Number of spectrogram frames needed to cover `time` seconds of audio
    n_frames = max(int(librosa.time_to_frames(time, sr=rate, hop_length=n_fft // 2, n_fft=n_fft)), 1)
    n_mels = 128
    # Resize so each image column becomes one spectrogram frame;
    # PIL's resize takes (width, height): width = frames, height = mel bins
    img = np.asarray(img.resize((n_frames, n_mels)), dtype=np.float32)
    # Normalize pixel values to [0, 1]; the epsilon guards against a constant image
    img = (img - img.min()) / (img.max() - img.min() + 1e-8)
    # Treat the normalized image as a mel magnitude spectrogram
    mel_spec = torch.tensor(img)
    # Invert the mel scale back to a linear-frequency magnitude spectrogram
    inverse_mel = torchaudio.transforms.InverseMelScale(
        n_stft=n_fft // 2 + 1, n_mels=n_mels, sample_rate=rate
    )
    linear_spec = inverse_mel(mel_spec)
    # Estimate the phase and synthesize a waveform with Griffin-Lim
    griffin_lim = torchaudio.transforms.GriffinLim(
        n_fft=n_fft, hop_length=n_fft // 2, n_iter=n_iter, power=1.0
    )
    audio = griffin_lim(linear_spec)
    return rate, audio.numpy()
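

# Example usage outside the UI (a minimal sketch; "example.png" and the
# soundfile dependency are assumptions, not part of this script):
#
#   import soundfile as sf
#   arr = np.asarray(Image.open("example.png").convert("RGB"))
#   rate, audio = img_to_audio(arr, time=3.0, n_fft=1024)
#   sf.write("example.wav", audio, rate)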
time = gr.Number(3.0, label="Audio time")
image = gr.Image(label="Image to sonify")
n_fft = gr.Number(1024, label="n_fft")

def main(image, time, n_fft):
    return img_to_audio(image, time=time, rate=22050, n_fft=int(n_fft))

desc = "Upload an image you would like to hear."
interface = gr.Interface(
    fn=main, inputs=[image, time, n_fft], outputs="audio",
    title="Improved Image Sonification", description=desc,
)
interface.launch()
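# interface.launch(share=True) would additionally expose a temporary public
# URL; the default launch serves the demo locally.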