Add MP3 support (with minimp3) for Audio Dataset and IOTensor (tensor…

…flow#801) * Add MP3 support (with minimp3) for Audio Dataset and IOTensor. This PR adds MP3 support (with minimp3) for Audio Dataset and IOTensor. The difference between this PR and FFmpeg is that FFmpeg is only supported on specific Ubuntu releases (16.04 and 18.04), through an external .so. This PR builds minimp3 as part of the program and supports Windows/Linux/macOS. Signed-off-by: Yong Tang <yong.tang.github@outlook.com> * Update minimp3 to carry patch on Windows. Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
i-ony · Feb 21, 2020 · 053073c · 053073c
1 parent 9f739a2
commit 053073c
Show file tree

Hide file tree

Showing 9 changed files with 218 additions and 1 deletion.
diff --git a/WORKSPACE b/WORKSPACE
@@ -721,6 +721,16 @@ http_archive(
     ],
 )
 
+http_archive(
+    name = "minimp3",
+    build_file = "//third_party:minimp3.BUILD",
+    sha256 = "5e216d54cb0423d99b92c5e81682cc22a9a1b028e961f56d52878cc967930bee",
+    strip_prefix = "minimp3-9229f280ad475a434d7592255dc01534db65504f",
+    urls = [
+        "https://github.com/lieff/minimp3/archive/9229f280ad475a434d7592255dc01534db65504f.tar.gz",
+    ],
+)
+
 http_archive(
     name = "postgresql",
     build_file = "//third_party:postgresql.BUILD",

diff --git a/tensorflow_io/core/BUILD b/tensorflow_io/core/BUILD
@@ -196,6 +196,7 @@ cc_library(
     deps = [
         "//tensorflow_io/core:dataset_ops",
         "@flac",
+        "@minimp3",
         "@vorbis",
     ],
     alwayslink = 1,

diff --git a/tensorflow_io/core/kernels/audio_kernels.cc b/tensorflow_io/core/kernels/audio_kernels.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow_io/core/kernels/io_stream.h"
 #include "vorbis/codec.h"
 #include "vorbis/vorbisfile.h"
+#define MINIMP3_IMPLEMENTATION
+#include "minimp3_ex.h"
 
 namespace tensorflow {
 namespace data {
@@ -698,6 +700,119 @@ class FlacReadableResource : public AudioReadableResourceBase {
   std::unique_ptr<FlacStreamDecoder> stream_decoder_;
 };
 
+// Adapter that lets minimp3 pull bytes from a SizedRandomAccessFile.
+//
+// The two static callbacks have the signatures stored in mp3dec_io_t
+// (`read` / `seek`); `user_data` must point at the MP3Stream instance.
+// The stream keeps its own cursor (`offset`) because the underlying file
+// is read with explicit offsets.
+class MP3Stream {
+ public:
+  // `file` is borrowed (not owned) and must outlive the stream.
+  // `size` is the total file length in bytes and bounds valid seeks.
+  MP3Stream(SizedRandomAccessFile* file, int64 size)
+      : file(file), size(size), offset(0) {}
+  ~MP3Stream() {}
+
+  // Reads up to `size` bytes at the current cursor into `buf`, advances the
+  // cursor, and returns the number of bytes delivered. A short count means
+  // end of file; 0 is also returned when the read fails outright.
+  static size_t ReadCallback(void* buf, size_t size, void* user_data) {
+    MP3Stream* p = static_cast<MP3Stream*>(user_data);
+    StringPiece result;
+    Status status =
+        p->file->Read(p->offset, size, &result, static_cast<char*>(buf));
+    // A short read at end of file is reported as OutOfRange while `result`
+    // still holds the bytes that were read, so only other errors abort.
+    if (!status.ok() && !errors::IsOutOfRange(status)) {
+      return 0;
+    }
+    // Read() is allowed to point `result` at storage other than the scratch
+    // buffer; make sure the caller's buffer really contains the data.
+    if (!result.empty() && result.data() != static_cast<const char*>(buf)) {
+      memmove(buf, result.data(), result.size());
+    }
+    p->offset += result.size();
+    return result.size();
+  }
+
+  // Moves the cursor to the absolute byte `position`. Returns 0 on success
+  // and -1 when `position` lies beyond the end of the file.
+  static int SeekCallback(uint64_t position, void* user_data) {
+    MP3Stream* p = static_cast<MP3Stream*>(user_data);
+    // `position` is unsigned, so only the upper bound needs checking; the
+    // comparison is done in unsigned space after ruling out a negative size.
+    if (p->size < 0 || position > static_cast<uint64_t>(p->size)) {
+      return -1;
+    }
+    p->offset = static_cast<int64>(position);
+    return 0;
+  }
+
+  SizedRandomAccessFile* file = nullptr;
+  int64 size = 0;
+  // 64-bit on every platform; `long` is only 32 bits wide on Windows.
+  int64 offset = 0;
+};
+
+// Audio resource that decodes an MP3 file through minimp3 and exposes it as
+// 16-bit PCM with shape {samples per channel, channels}.
+//
+// All public methods serialize on `mu_`. The minimp3 decoder state is
+// released with mp3dec_ex_close() when the resource is destroyed or
+// re-initialized.
+class MP3ReadableResource : public AudioReadableResourceBase {
+ public:
+  MP3ReadableResource(Env* env)
+      : env_(env), file_size_(0), dtype_(DT_INT16), rate_(0) {
+    // Zero the C structs so that the destructor (and a first Init) can hand
+    // them to mp3dec_ex_close() without touching indeterminate pointers.
+    memset(&mp3dec_io_, 0x00, sizeof(mp3dec_io_));
+    memset(&mp3dec_ex_, 0x00, sizeof(mp3dec_ex_));
+  }
+  ~MP3ReadableResource() {
+    // Free whatever mp3dec_ex_open_cb() allocated for this decoder.
+    mp3dec_ex_close(&mp3dec_ex_);
+  }
+
+  // Opens `input` (a file name) as MP3 and records its shape, dtype and
+  // sample rate. Returns InvalidArgument when minimp3 cannot open the file
+  // or finds no audio channels in it.
+  Status Init(const string& input) override {
+    mutex_lock l(mu_);
+
+    // Release decoder state from any earlier Init() before starting over.
+    mp3dec_ex_close(&mp3dec_ex_);
+    memset(&mp3dec_ex_, 0x00, sizeof(mp3dec_ex_));
+
+    const string& filename = input;
+    file_.reset(new SizedRandomAccessFile(env_, filename, nullptr, 0));
+    TF_RETURN_IF_ERROR(file_->GetFileSize(&file_size_));
+
+    stream_.reset(new MP3Stream(file_.get(), file_size_));
+
+    mp3dec_io_.read = MP3Stream::ReadCallback;
+    mp3dec_io_.read_data = stream_.get();
+    mp3dec_io_.seek = MP3Stream::SeekCallback;
+    mp3dec_io_.seek_data = stream_.get();
+    // Report the code returned by the call itself rather than last_error.
+    const int open_error =
+        mp3dec_ex_open_cb(&mp3dec_ex_, &mp3dec_io_, MP3D_SEEK_TO_SAMPLE);
+    if (open_error) {
+      return errors::InvalidArgument("unable to open file ", filename,
+                                     " as mp3: ", open_error);
+    }
+    const int64 channels = mp3dec_ex_.info.channels;
+    const int64 rate = mp3dec_ex_.info.hz;
+    // Every file with an unrecognized header is routed here, so a file with
+    // no decodable frame must be rejected before dividing by `channels`.
+    if (channels <= 0) {
+      return errors::InvalidArgument("unable to open file ", filename,
+                                     " as mp3: no audio channels found");
+    }
+    // mp3dec_ex_.samples counts samples across all channels.
+    const int64 samples = static_cast<int64>(mp3dec_ex_.samples) / channels;
+
+    shape_ = TensorShape({samples, channels});
+    dtype_ = DT_INT16;
+    rate_ = rate;
+
+    return Status::OK();
+  }
+
+  // Copies out the shape, dtype and sample rate recorded by Init().
+  Status Spec(TensorShape* shape, DataType* dtype, int32* rate) override {
+    mutex_lock l(mu_);
+    *shape = shape_;
+    *dtype = dtype_;
+    *rate = rate_;
+    return Status::OK();
+  }
+
+  // Decodes the rows [start, stop) into a tensor obtained from
+  // `allocate_func`. A negative `stop`, or one past the end, means "to the
+  // end"; a `start` at or beyond the effective stop yields zero rows.
+  // Returns InvalidArgument for a negative `start`, a failed seek, or a
+  // short read.
+  Status Read(const int64 start, const int64 stop,
+              std::function<Status(const TensorShape& shape, Tensor** value)>
+                  allocate_func) override {
+    mutex_lock l(mu_);
+
+    // A negative start can never be satisfied; fail before allocating a
+    // tensor that would be larger than the audio itself.
+    if (start < 0) {
+      return errors::InvalidArgument("seek to ", start,
+                                     " failed: negative start");
+    }
+
+    const int64 total = shape_.dim_size(0);
+    const int64 channels = shape_.dim_size(1);
+    const int64 sample_stop = (stop < 0 || stop > total) ? total : stop;
+    const int64 sample_start = (start >= sample_stop) ? sample_stop : start;
+
+    Tensor* value;
+    TF_RETURN_IF_ERROR(allocate_func(
+        TensorShape({sample_stop - sample_start, channels}), &value));
+
+    // The decoder position is expressed in samples across all channels.
+    const int seek_error = mp3dec_ex_seek(&mp3dec_ex_, sample_start * channels);
+    if (seek_error) {
+      return errors::InvalidArgument("seek to ", sample_start,
+                                     " failed: ", seek_error);
+    }
+    const size_t requested = static_cast<size_t>(value->NumElements());
+    const size_t returned =
+        mp3dec_ex_read(&mp3dec_ex_, value->flat<int16>().data(), requested);
+    if (returned != requested) {
+      return errors::InvalidArgument("read ", value->NumElements(), " from ",
+                                     sample_start,
+                                     " failed: ", mp3dec_ex_.last_error);
+    }
+    return Status::OK();
+  }
+  string DebugString() const override { return "MP3ReadableResource"; }
+
+ private:
+  mutable mutex mu_;
+  Env* env_ GUARDED_BY(mu_);
+  std::unique_ptr<SizedRandomAccessFile> file_ GUARDED_BY(mu_);
+  uint64 file_size_ GUARDED_BY(mu_);
+  DataType dtype_;
+  TensorShape shape_;
+  int64 rate_;
+
+  // `stream_` is referenced by `mp3dec_io_`, which in turn is referenced by
+  // `mp3dec_ex_` once the decoder has been opened.
+  std::unique_ptr<MP3Stream> stream_;
+  mp3dec_io_t mp3dec_io_;
+  mp3dec_ex_t mp3dec_ex_;
+};
+
 class AudioReadableResource : public AudioReadableResourceBase {
  public:
   AudioReadableResource(Env* env) : env_(env), resource_(nullptr) {}
@@ -717,7 +832,7 @@ class AudioReadableResource : public AudioReadableResourceBase {
     } else if (memcmp(header, "fLaC", 4) == 0) {
       resource_.reset(new FlacReadableResource(env_));
     } else {
-      return errors::InvalidArgument("unknown header: ", header);
+      resource_.reset(new MP3ReadableResource(env_));
     }
     return resource_->Init(input);
   }

diff --git a/tests/test_audio/l1-fl6.bit b/tests/test_audio/l1-fl6.bit
diff --git a/tests/test_audio/l1-fl6.pcm b/tests/test_audio/l1-fl6.pcm
diff --git a/tests/test_audio/l1-fl6.raw b/tests/test_audio/l1-fl6.raw
diff --git a/tests/test_io_dataset_eager.py b/tests/test_io_dataset_eager.py
@@ -380,6 +380,27 @@ def fixture_audio_flac():
 
   return args, func, expected
 
+@pytest.fixture(name="audio_mp3", scope="module")
+def fixture_audio_mp3():
+  """fixture_audio_mp3"""
+  # l1-fl6.bit was taken from minimp3
+  # l1-fl6.raw is the converted, through minimp3
+  path = os.path.join(
+      os.path.dirname(os.path.abspath(__file__)),
+      "test_audio", "l1-fl6.bit")
+  raw_path = os.path.join(
+      os.path.dirname(os.path.abspath(__file__)),
+      "test_audio", "l1-fl6.raw")
+  raw = np.fromfile(raw_path, np.int16)
+  raw = raw.reshape([-1, 2])
+  value = tf.cast(raw, tf.int16)
+
+  args = path
+  func = lambda args: tfio.IODataset.graph(tf.int16).from_audio(args)
+  expected = [v for _, v in enumerate(value)]
+
+  return args, func, expected
+
 @pytest.fixture(name="hdf5", scope="module")
 def fixture_hdf5(request):
   """fixture_hdf5"""
@@ -812,6 +833,7 @@ def func(q):
             ],
         ),
         pytest.param("audio_flac"),
+        pytest.param("audio_mp3"),
         pytest.param(
             "prometheus_scrape",
             marks=[
@@ -856,6 +878,7 @@ def func(q):
         "audio[wav/24bit]",
         "audio[ogg]",
         "audio[flac]",
+        "audio[mp3]",
         "prometheus[scrape]",
         "kinesis",
         "pubsub",
@@ -912,6 +935,7 @@ def test_io_dataset_basic(fixture_lookup, io_dataset_fixture):
             ],
         ),
         pytest.param("audio_flac"),
+        pytest.param("audio_mp3"),
         pytest.param(
             "prometheus_scrape",
             marks=[
@@ -952,6 +976,7 @@ def test_io_dataset_basic(fixture_lookup, io_dataset_fixture):
         "audio[wav/24bit]",
         "audio[ogg]",
         "audio[flac]",
+        "audio[mp3]",
         "prometheus[scrape]",
         "hdf5",
         "grpc",
@@ -1027,6 +1052,7 @@ def test_io_dataset_basic_operation(fixture_lookup, io_dataset_fixture):
             ],
         ),
         pytest.param("audio_flac"),
+        pytest.param("audio_mp3"),
         pytest.param("hdf5"),
         pytest.param("grpc"),
         pytest.param("numpy"),
@@ -1053,6 +1079,7 @@ def test_io_dataset_basic_operation(fixture_lookup, io_dataset_fixture):
         "audio[wav/24bit]",
         "audio[ogg]",
         "audio[flac]",
+        "audio[mp3]",
         "hdf5",
         "grpc",
         "numpy",
@@ -1142,6 +1169,8 @@ def test_io_dataset_for_training(fixture_lookup, io_dataset_fixture):
         ),
         pytest.param("audio_flac", None),
         pytest.param("audio_flac", 2),
+        pytest.param("audio_mp3", None),
+        pytest.param("audio_mp3", 2),
         pytest.param("hdf5_graph", None),
         pytest.param("hdf5_graph", 2),
         pytest.param("numpy_file_tuple_graph", None),
@@ -1186,6 +1215,8 @@ def test_io_dataset_for_training(fixture_lookup, io_dataset_fixture):
         "audio[ogg]|2",
         "audio[flac]",
         "audio[flac]|2",
+        "audio[mp3]",
+        "audio[mp3]|2",
         "hdf5",
         "hdf5|2",
         "numpy[file/tuple]",
@@ -1260,6 +1291,7 @@ def f(v):
             ],
         ),
         pytest.param("audio_flac"),
+        pytest.param("audio_mp3"),
         pytest.param("hdf5"),
         pytest.param("numpy"),
         pytest.param("numpy_structure"),
@@ -1282,6 +1314,7 @@ def f(v):
         "audio[wav/24bit]",
         "audio[ogg]",
         "audio[flac]",
+        "audio[mp3]",
         "hdf5",
         "numpy",
         "numpy[structure]",

diff --git a/tests/test_io_tensor_eager.py b/tests/test_io_tensor_eager.py
@@ -169,6 +169,40 @@ def fixture_audio_rate_flac():
 
   return args, func, expected
 
+@pytest.fixture(name="audio_mp3", scope="module")
+def fixture_audio_mp3():
+  """fixture_audio_mp3"""
+  # l1-fl6.bit was taken from minimp3
+  # l1-fl6.raw is the converted, through minimp3
+  path = os.path.join(
+      os.path.dirname(os.path.abspath(__file__)),
+      "test_audio", "l1-fl6.bit")
+  raw_path = os.path.join(
+      os.path.dirname(os.path.abspath(__file__)),
+      "test_audio", "l1-fl6.raw")
+  raw = np.fromfile(raw_path, np.int16)
+  raw = raw.reshape([-1, 2])
+  value = tf.cast(raw, tf.int16)
+
+  args = path
+  func = lambda args: tfio.IOTensor.graph(tf.int16).from_audio(args)
+  expected = value
+
+  return args, func, expected
+
+@pytest.fixture(name="audio_rate_mp3", scope="module")
+def fixture_audio_rate_mp3():
+  """fixture_audio_rate_mp3"""
+  path = os.path.join(
+      os.path.dirname(os.path.abspath(__file__)),
+      "test_audio", "l1-fl6.bit")
+
+  args = path
+  func = lambda args: tfio.IOTensor.graph(tf.int16).from_audio(args).rate
+  expected = tf.constant(44100)
+
+  return args, func, expected
+
 @pytest.fixture(name="kafka")
 def fixture_kafka():
   """fixture_kafka"""
@@ -347,6 +381,7 @@ def test_io_tensor_scalar(fixture_lookup, io_tensor_fixture):
         pytest.param("audio_wav_24"),
         pytest.param("audio_ogg"),
         pytest.param("audio_flac"),
+        pytest.param("audio_mp3"),
         pytest.param("hdf5"),
         pytest.param("kafka"),
         pytest.param("arrow"),
@@ -356,6 +391,7 @@ def test_io_tensor_scalar(fixture_lookup, io_tensor_fixture):
         "audio[wav/24bit]",
         "audio[ogg]",
         "audio[flac]",
+        "audio[mp3]",
         "hdf5",
         "kafka",
         "arrow",
@@ -415,6 +451,8 @@ def test_io_tensor_slice_multiple_dimension(fixture_lookup, io_tensor_fixture):
         pytest.param("audio_ogg", 2),
         pytest.param("audio_flac", None),
         pytest.param("audio_flac", 2),
+        pytest.param("audio_mp3", None),
+        pytest.param("audio_mp3", 2),
         pytest.param("hdf5_graph", None),
         pytest.param("hdf5_graph", 2),
         pytest.param("kafka", None),
@@ -441,6 +479,8 @@ def test_io_tensor_slice_multiple_dimension(fixture_lookup, io_tensor_fixture):
         "audio[ogg]|2",
         "audio[flac]",
         "audio[flac]|2",
+        "audio[mp3]",
+        "audio[mp3]|2",
         "hdf5",
         "hdf5|2",
         "kafka",
@@ -498,12 +538,14 @@ def g(e):
         pytest.param("audio_rate_wav_24"),
         pytest.param("audio_rate_ogg"),
         pytest.param("audio_rate_flac"),
+        pytest.param("audio_rate_mp3"),
     ],
     ids=[
         "audio[rate][wav]",
         "audio[rate][wav/24bit]",
         "audio[rate][ogg]",
         "audio[rate][flac]",
+        "audio[rate][mp3]",
     ],
 )
 def test_io_tensor_meta(fixture_lookup, io_tensor_fixture):
@@ -522,12 +564,14 @@ def test_io_tensor_meta(fixture_lookup, io_tensor_fixture):
         pytest.param("audio_rate_wav_24"),
         pytest.param("audio_rate_ogg"),
         pytest.param("audio_rate_flac"),
+        pytest.param("audio_rate_mp3"),
     ],
     ids=[
         "audio[rate][wav]",
         "audio[rate][wav/24bit]",
         "audio[rate][ogg]",
         "audio[rate][flac]",
+        "audio[rate][mp3]",
     ],
 )
 def test_io_tensor_meta_in_dataset(fixture_lookup, io_tensor_fixture):
@@ -561,6 +605,7 @@ def f(e):
         pytest.param("audio_wav_24"),
         pytest.param("audio_ogg"),
         pytest.param("audio_flac"),
+        pytest.param("audio_mp3"),
         pytest.param("hdf5"),
         pytest.param("arrow"),
     ],
@@ -569,6 +614,7 @@ def f(e):
         "audio[wav/24bit]",
         "audio[ogg]",
         "audio[flac]",
+        "audio[mp3]",
         "hdf5",
         "arrow",
     ],