Skip to content

Commit

Permalink
Add MP3 support (with minimp3) for Audio Dataset and IOTensor (tensor…
Browse files Browse the repository at this point in the history
…flow#801)

* Add MP3 support (with minimp3) for Audio Dataset and IOTensor

This PR adds MP3 support (with minimp3) for Audio Dataset and IOTensor.
The difference between this PR and the FFmpeg-based support is that FFmpeg is
only supported on Ubuntu 16.04 and 18.04, through external .so libraries.
This PR builds minimp3 as part of the program and supports Windows/Linux/macOS.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update minimp3 to carry patch on Windows

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
  • Loading branch information
yongtang authored Feb 21, 2020
1 parent 9f739a2 commit 053073c
Show file tree
Hide file tree
Showing 9 changed files with 218 additions and 1 deletion.
10 changes: 10 additions & 0 deletions WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,16 @@ http_archive(
],
)

# minimp3: MP3 decoder sources, exposed to the build as the "@minimp3"
# repository and compiled with the local //third_party:minimp3.BUILD file.
# The archive is pinned to a single upstream commit (see strip_prefix/urls);
# sha256 is the checksum the downloaded tarball is expected to match.
http_archive(
name = "minimp3",
build_file = "//third_party:minimp3.BUILD",
sha256 = "5e216d54cb0423d99b92c5e81682cc22a9a1b028e961f56d52878cc967930bee",
strip_prefix = "minimp3-9229f280ad475a434d7592255dc01534db65504f",
urls = [
"https://github.com/lieff/minimp3/archive/9229f280ad475a434d7592255dc01534db65504f.tar.gz",
],
)

http_archive(
name = "postgresql",
build_file = "//third_party:postgresql.BUILD",
Expand Down
1 change: 1 addition & 0 deletions tensorflow_io/core/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ cc_library(
deps = [
"//tensorflow_io/core:dataset_ops",
"@flac",
"@minimp3",
"@vorbis",
],
alwayslink = 1,
Expand Down
117 changes: 116 additions & 1 deletion tensorflow_io/core/kernels/audio_kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ limitations under the License.
#include "tensorflow_io/core/kernels/io_stream.h"
#include "vorbis/codec.h"
#include "vorbis/vorbisfile.h"
#define MINIMP3_IMPLEMENTATION
#include "minimp3_ex.h"

namespace tensorflow {
namespace data {
Expand Down Expand Up @@ -698,6 +700,119 @@ class FlacReadableResource : public AudioReadableResourceBase {
std::unique_ptr<FlacStreamDecoder> stream_decoder_;
};

class MP3Stream {
public:
MP3Stream(SizedRandomAccessFile* file, int64 size)
: file(file), size(size), offset(0) {}
~MP3Stream() {}

static size_t ReadCallback(void* buf, size_t size, void* user_data) {
MP3Stream* p = static_cast<MP3Stream*>(user_data);
StringPiece result;
Status status = p->file->Read(p->offset, size, &result, (char*)buf);
p->offset += result.size();
return result.size();
}

static int SeekCallback(uint64_t position, void* user_data) {
MP3Stream* p = static_cast<MP3Stream*>(user_data);
if (position < 0 || position > p->size) {
return -1;
}
p->offset = position;
return 0;
}

SizedRandomAccessFile* file = nullptr;
int64 size = 0;
long offset = 0;
};

class MP3ReadableResource : public AudioReadableResourceBase {
public:
MP3ReadableResource(Env* env) : env_(env) {}
~MP3ReadableResource() {}

Status Init(const string& input) override {
mutex_lock l(mu_);

const string& filename = input;
file_.reset(new SizedRandomAccessFile(env_, filename, nullptr, 0));
TF_RETURN_IF_ERROR(file_->GetFileSize(&file_size_));

stream_.reset(new MP3Stream(file_.get(), file_size_));

mp3dec_io_.read = MP3Stream::ReadCallback;
mp3dec_io_.read_data = stream_.get();
mp3dec_io_.seek = MP3Stream::SeekCallback;
mp3dec_io_.seek_data = stream_.get();
memset(&mp3dec_ex_, 0x00, sizeof(mp3dec_ex_));
if (mp3dec_ex_open_cb(&mp3dec_ex_, &mp3dec_io_, MP3D_SEEK_TO_SAMPLE)) {
return errors::InvalidArgument("unable to open file ", filename,
" as mp3: ", mp3dec_ex_.last_error);
}
int64 samples = mp3dec_ex_.samples / mp3dec_ex_.info.channels;
int64 channels = mp3dec_ex_.info.channels;
int64 rate = mp3dec_ex_.info.hz;

shape_ = TensorShape({samples, channels});
dtype_ = DT_INT16;
rate_ = rate;

return Status::OK();
}

Status Spec(TensorShape* shape, DataType* dtype, int32* rate) override {
mutex_lock l(mu_);
*shape = shape_;
*dtype = dtype_;
*rate = rate_;
return Status::OK();
}

Status Read(const int64 start, const int64 stop,
std::function<Status(const TensorShape& shape, Tensor** value)>
allocate_func) override {
mutex_lock l(mu_);

int64 sample_stop =
(stop < 0) ? (shape_.dim_size(0))
: (stop < shape_.dim_size(0) ? stop : shape_.dim_size(0));
int64 sample_start = (start >= sample_stop) ? sample_stop : start;

Tensor* value;
TF_RETURN_IF_ERROR(allocate_func(
TensorShape({sample_stop - sample_start, shape_.dim_size(1)}), &value));

if (mp3dec_ex_seek(&mp3dec_ex_, sample_start * shape_.dim_size(1))) {
return errors::InvalidArgument("seek to ", sample_start,
" failed: ", mp3dec_ex_.last_error);
}
size_t returned = mp3dec_ex_read(&mp3dec_ex_, value->flat<int16>().data(),
value->NumElements());
if (returned != value->NumElements()) {
return errors::InvalidArgument("read ", value->NumElements(), " from ",
sample_start,
" failed: ", mp3dec_ex_.last_error);
}
return Status::OK();
}
string DebugString() const override { return "MP3ReadableResource"; }

private:
mutable mutex mu_;
Env* env_ GUARDED_BY(mu_);
std::unique_ptr<SizedRandomAccessFile> file_ GUARDED_BY(mu_);
uint64 file_size_ GUARDED_BY(mu_);
DataType dtype_;
TensorShape shape_;
int64 rate_;

std::unique_ptr<MP3Stream> stream_;
mp3dec_io_t mp3dec_io_;
mp3dec_ex_t mp3dec_ex_;
};

class AudioReadableResource : public AudioReadableResourceBase {
public:
AudioReadableResource(Env* env) : env_(env), resource_(nullptr) {}
Expand All @@ -717,7 +832,7 @@ class AudioReadableResource : public AudioReadableResourceBase {
} else if (memcmp(header, "fLaC", 4) == 0) {
resource_.reset(new FlacReadableResource(env_));
} else {
return errors::InvalidArgument("unknown header: ", header);
resource_.reset(new MP3ReadableResource(env_));
}
return resource_->Init(input);
}
Expand Down
Binary file added tests/test_audio/l1-fl6.bit
Binary file not shown.
Binary file added tests/test_audio/l1-fl6.pcm
Binary file not shown.
Binary file added tests/test_audio/l1-fl6.raw
Binary file not shown.
33 changes: 33 additions & 0 deletions tests/test_io_dataset_eager.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,27 @@ def fixture_audio_flac():

return args, func, expected

@pytest.fixture(name="audio_mp3", scope="module")
def fixture_audio_mp3():
  """Provides (args, func, expected) for reading MP3 audio as an IODataset.

  `args` is the path of the MP3 test file, `func` builds the dataset from
  that path, and `expected` is the list of rows the dataset should yield.
  """
  audio_dir = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_audio")
  # l1-fl6.bit is a test stream taken from minimp3;
  # l1-fl6.raw is the same audio converted to raw int16 samples by minimp3.
  mp3_path = os.path.join(audio_dir, "l1-fl6.bit")
  pcm_path = os.path.join(audio_dir, "l1-fl6.raw")

  # Two columns per row, one row per sample frame.
  samples = np.fromfile(pcm_path, np.int16).reshape([-1, 2])
  reference = tf.cast(samples, tf.int16)

  def func(filename):
    return tfio.IODataset.graph(tf.int16).from_audio(filename)

  return mp3_path, func, list(reference)

@pytest.fixture(name="hdf5", scope="module")
def fixture_hdf5(request):
"""fixture_hdf5"""
Expand Down Expand Up @@ -812,6 +833,7 @@ def func(q):
],
),
pytest.param("audio_flac"),
pytest.param("audio_mp3"),
pytest.param(
"prometheus_scrape",
marks=[
Expand Down Expand Up @@ -856,6 +878,7 @@ def func(q):
"audio[wav/24bit]",
"audio[ogg]",
"audio[flac]",
"audio[mp3]",
"prometheus[scrape]",
"kinesis",
"pubsub",
Expand Down Expand Up @@ -912,6 +935,7 @@ def test_io_dataset_basic(fixture_lookup, io_dataset_fixture):
],
),
pytest.param("audio_flac"),
pytest.param("audio_mp3"),
pytest.param(
"prometheus_scrape",
marks=[
Expand Down Expand Up @@ -952,6 +976,7 @@ def test_io_dataset_basic(fixture_lookup, io_dataset_fixture):
"audio[wav/24bit]",
"audio[ogg]",
"audio[flac]",
"audio[mp3]",
"prometheus[scrape]",
"hdf5",
"grpc",
Expand Down Expand Up @@ -1027,6 +1052,7 @@ def test_io_dataset_basic_operation(fixture_lookup, io_dataset_fixture):
],
),
pytest.param("audio_flac"),
pytest.param("audio_mp3"),
pytest.param("hdf5"),
pytest.param("grpc"),
pytest.param("numpy"),
Expand All @@ -1053,6 +1079,7 @@ def test_io_dataset_basic_operation(fixture_lookup, io_dataset_fixture):
"audio[wav/24bit]",
"audio[ogg]",
"audio[flac]",
"audio[mp3]",
"hdf5",
"grpc",
"numpy",
Expand Down Expand Up @@ -1142,6 +1169,8 @@ def test_io_dataset_for_training(fixture_lookup, io_dataset_fixture):
),
pytest.param("audio_flac", None),
pytest.param("audio_flac", 2),
pytest.param("audio_mp3", None),
pytest.param("audio_mp3", 2),
pytest.param("hdf5_graph", None),
pytest.param("hdf5_graph", 2),
pytest.param("numpy_file_tuple_graph", None),
Expand Down Expand Up @@ -1186,6 +1215,8 @@ def test_io_dataset_for_training(fixture_lookup, io_dataset_fixture):
"audio[ogg]|2",
"audio[flac]",
"audio[flac]|2",
"audio[mp3]",
"audio[mp3]|2",
"hdf5",
"hdf5|2",
"numpy[file/tuple]",
Expand Down Expand Up @@ -1260,6 +1291,7 @@ def f(v):
],
),
pytest.param("audio_flac"),
pytest.param("audio_mp3"),
pytest.param("hdf5"),
pytest.param("numpy"),
pytest.param("numpy_structure"),
Expand All @@ -1282,6 +1314,7 @@ def f(v):
"audio[wav/24bit]",
"audio[ogg]",
"audio[flac]",
"audio[mp3]",
"hdf5",
"numpy",
"numpy[structure]",
Expand Down
46 changes: 46 additions & 0 deletions tests/test_io_tensor_eager.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,40 @@ def fixture_audio_rate_flac():

return args, func, expected

@pytest.fixture(name="audio_mp3", scope="module")
def fixture_audio_mp3():
  """Provides (args, func, expected) for reading MP3 audio as an IOTensor.

  `args` is the path of the MP3 test file, `func` builds the IOTensor from
  that path, and `expected` is the full int16 tensor it should contain.
  """
  audio_dir = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_audio")
  # l1-fl6.bit is a test stream taken from minimp3;
  # l1-fl6.raw is the same audio converted to raw int16 samples by minimp3.
  mp3_path = os.path.join(audio_dir, "l1-fl6.bit")
  pcm_path = os.path.join(audio_dir, "l1-fl6.raw")

  # Two columns per row, one row per sample frame.
  samples = np.fromfile(pcm_path, np.int16).reshape([-1, 2])
  reference = tf.cast(samples, tf.int16)

  def func(filename):
    return tfio.IOTensor.graph(tf.int16).from_audio(filename)

  return mp3_path, func, reference

@pytest.fixture(name="audio_rate_mp3", scope="module")
def fixture_audio_rate_mp3():
  """Provides (args, func, expected) for the sample rate of the MP3 file.

  `func` opens the file as an IOTensor and returns its `rate` attribute,
  which is expected to equal 44100.
  """
  tests_dir = os.path.dirname(os.path.abspath(__file__))
  mp3_path = os.path.join(tests_dir, "test_audio", "l1-fl6.bit")

  def func(filename):
    audio = tfio.IOTensor.graph(tf.int16).from_audio(filename)
    return audio.rate

  return mp3_path, func, tf.constant(44100)

@pytest.fixture(name="kafka")
def fixture_kafka():
"""fixture_kafka"""
Expand Down Expand Up @@ -347,6 +381,7 @@ def test_io_tensor_scalar(fixture_lookup, io_tensor_fixture):
pytest.param("audio_wav_24"),
pytest.param("audio_ogg"),
pytest.param("audio_flac"),
pytest.param("audio_mp3"),
pytest.param("hdf5"),
pytest.param("kafka"),
pytest.param("arrow"),
Expand All @@ -356,6 +391,7 @@ def test_io_tensor_scalar(fixture_lookup, io_tensor_fixture):
"audio[wav/24bit]",
"audio[ogg]",
"audio[flac]",
"audio[mp3]",
"hdf5",
"kafka",
"arrow",
Expand Down Expand Up @@ -415,6 +451,8 @@ def test_io_tensor_slice_multiple_dimension(fixture_lookup, io_tensor_fixture):
pytest.param("audio_ogg", 2),
pytest.param("audio_flac", None),
pytest.param("audio_flac", 2),
pytest.param("audio_mp3", None),
pytest.param("audio_mp3", 2),
pytest.param("hdf5_graph", None),
pytest.param("hdf5_graph", 2),
pytest.param("kafka", None),
Expand All @@ -441,6 +479,8 @@ def test_io_tensor_slice_multiple_dimension(fixture_lookup, io_tensor_fixture):
"audio[ogg]|2",
"audio[flac]",
"audio[flac]|2",
"audio[mp3]",
"audio[mp3]|2",
"hdf5",
"hdf5|2",
"kafka",
Expand Down Expand Up @@ -498,12 +538,14 @@ def g(e):
pytest.param("audio_rate_wav_24"),
pytest.param("audio_rate_ogg"),
pytest.param("audio_rate_flac"),
pytest.param("audio_rate_mp3"),
],
ids=[
"audio[rate][wav]",
"audio[rate][wav/24bit]",
"audio[rate][ogg]",
"audio[rate][flac]",
"audio[rate][mp3]",
],
)
def test_io_tensor_meta(fixture_lookup, io_tensor_fixture):
Expand All @@ -522,12 +564,14 @@ def test_io_tensor_meta(fixture_lookup, io_tensor_fixture):
pytest.param("audio_rate_wav_24"),
pytest.param("audio_rate_ogg"),
pytest.param("audio_rate_flac"),
pytest.param("audio_rate_mp3"),
],
ids=[
"audio[rate][wav]",
"audio[rate][wav/24bit]",
"audio[rate][ogg]",
"audio[rate][flac]",
"audio[rate][mp3]",
],
)
def test_io_tensor_meta_in_dataset(fixture_lookup, io_tensor_fixture):
Expand Down Expand Up @@ -561,6 +605,7 @@ def f(e):
pytest.param("audio_wav_24"),
pytest.param("audio_ogg"),
pytest.param("audio_flac"),
pytest.param("audio_mp3"),
pytest.param("hdf5"),
pytest.param("arrow"),
],
Expand All @@ -569,6 +614,7 @@ def f(e):
"audio[wav/24bit]",
"audio[ogg]",
"audio[flac]",
"audio[mp3]",
"hdf5",
"arrow",
],
Expand Down
Loading

0 comments on commit 053073c

Please sign in to comment.