From 0bbb8afd0f11b3e85f90deac0846d13d999920bd Mon Sep 17 00:00:00 2001
From: Qubad786 <muhammadrehan69@gmail.com>
Date: Mon, 21 May 2018 18:31:33 +0500
Subject: [PATCH] Export transcripts metadata along with xml  - Transcript
 files are exported into course OLX in .srt format.  - Transcript language to
 filename maps is returned with xml, so that, it can be used by platform in
 old metadata fields for backward compatiblilty.  - Add/fix tests

bump VAL version
---
 edxval/api.py                         | 102 ++++++++++++++--------
 edxval/exceptions.py                  |   7 ++
 edxval/tests/test_api.py              |  66 +++++++++------
 edxval/tests/test_transcript_utils.py |  94 +++++++++++++++++++++
 edxval/transcript_utils.py            | 117 ++++++++++++++++++++++++++
 edxval/utils.py                       |   4 +-
 setup.py                              |   2 +-
 7 files changed, 327 insertions(+), 65 deletions(-)
 create mode 100644 edxval/tests/test_transcript_utils.py
 create mode 100644 edxval/transcript_utils.py

diff --git a/edxval/api.py b/edxval/api.py
index 6141ec3d..03480362 100644
--- a/edxval/api.py
+++ b/edxval/api.py
@@ -3,6 +3,7 @@
 """
 The internal API for VAL.
 """
+import os
 import logging
 from enum import Enum
 from uuid import uuid4
@@ -17,15 +18,35 @@
 from lxml.etree import Element, SubElement
 from pysrt.srtexc import Error
 
-from edxval.exceptions import (InvalidTranscriptFormat,
-                               InvalidTranscriptProvider, ValCannotCreateError,
-                               ValCannotUpdateError, ValInternalError,
-                               ValVideoNotFoundError)
-from edxval.models import (CourseVideo, EncodedVideo, Profile, TranscriptPreference,
-                           TranscriptProviderType, Video, VideoImage,
-                           VideoTranscript, ThirdPartyTranscriptCredentialsState)
+from edxval.exceptions import (
+    InvalidTranscriptFormat,
+    TranscriptsGenerationException,
+    InvalidTranscriptProvider,
+    ValCannotCreateError,
+    ValCannotUpdateError,
+    ValInternalError,
+    ValVideoNotFoundError,
+)
+from edxval.models import (
+    CourseVideo,
+    EncodedVideo,
+    Profile,
+    TranscriptPreference,
+    TranscriptProviderType,
+    Video,
+    VideoImage,
+    VideoTranscript,
+    ThirdPartyTranscriptCredentialsState,
+)
 from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
-from edxval.utils import TranscriptFormat, THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs, get_transcript_format
+from edxval.utils import (
+    TranscriptFormat,
+    THIRD_PARTY_TRANSCRIPTION_PLANS,
+    create_file_in_fs,
+    get_transcript_format,
+)
+
+from edxval.transcript_utils import Transcript
 
 
 logger = logging.getLogger(__name__)  # pylint: disable=C0103
@@ -829,6 +850,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
                 for name in ['profile', 'url', 'file_size', 'bitrate']
             }
         )
+
     return create_transcripts_xml(video_id, video_el, resource_fs, static_dir)
 
 
@@ -843,21 +865,26 @@ def create_transcript_file(video_id, language_code, file_format, resource_fs, st
         static_dir (str): The Directory to store transcript file.
         resource_fs (SubFS): The file system to store transcripts.
     """
-    transcript_name = u'{video_id}-{language_code}.{file_format}'.format(
+    transcript_filename = '{video_id}-{language_code}.srt'.format(
         video_id=video_id,
-        language_code=language_code,
-        file_format=file_format
+        language_code=language_code
     )
     transcript_data = get_video_transcript_data(video_id, language_code)
     if transcript_data:
-        transcript_content = transcript_data['content']
-        create_file_in_fs(transcript_content, transcript_name, resource_fs, static_dir)
+        transcript_content = Transcript.convert(
+            transcript_data['content'],
+            input_format=file_format,
+            output_format=Transcript.SRT
+        )
+        create_file_in_fs(transcript_content, transcript_filename, resource_fs, static_dir)
+
+    return transcript_filename
 
 
 def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
     """
     Creates xml for transcripts.
-    For each transcript elment, an associated transcript file is also created in course OLX.
+    For each transcript element, an associated transcript file is also created in course OLX.
 
     Arguments:
         video_id (str): Video id of the video.
@@ -873,32 +900,36 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
     if video_transcripts.exists():
         transcripts_el = SubElement(video_el, 'transcripts')
 
-    exported_language_codes = []
+    transcript_files_map = {}
     for video_transcript in video_transcripts:
-        if video_transcript.language_code not in exported_language_codes:
-            language_code = video_transcript.language_code
-            file_format = video_transcript.file_format
+        language_code = video_transcript.language_code
+        file_format = video_transcript.file_format
 
-            create_transcript_file(
-                video_id,
-                language_code,
-                file_format,
-                resource_fs.delegate_fs(),
-                combine(u'course', static_dir)  # File system should not start from /draft directory.
+        try:
+            transcript_filename = create_transcript_file(
+                video_id=video_id,
+                language_code=language_code,
+                file_format=file_format,
+                resource_fs=resource_fs.delegate_fs(),
+                static_dir=combine(u'course', static_dir)  # File system should not start from /draft directory.
             )
+            transcript_files_map[language_code] = transcript_filename
+        except TranscriptsGenerationException:
+            # we don't want to halt export in this case, just log and move to the next transcript.
+            logger.exception('[VAL] Error while generating "%s" transcript for video["%s"].', language_code, video_id)
+            continue
 
-            SubElement(
-                transcripts_el,
-                'transcript',
-                {
-                    'language_code': language_code,
-                    'file_format': file_format,
-                    'provider': video_transcript.provider,
-                }
-            )
-            exported_language_codes.append(video_transcript.language_code)
+        SubElement(
+            transcripts_el,
+            'transcript',
+            {
+                'language_code': language_code,
+                'file_format': Transcript.SRT,
+                'provider': video_transcript.provider,
+            }
+        )
 
-    return video_el
+    return dict(xml=video_el, transcripts=transcript_files_map)
 
 
 def import_from_xml(xml, edx_video_id, resource_fs, static_dir, external_transcripts=dict(), course_id=None):
@@ -1033,7 +1064,6 @@ def import_transcript_from_fs(edx_video_id, language_code, file_name, provider,
             )
             return
 
-
         # Get file format from transcript content.
         try:
             file_format = get_transcript_format(file_content)
diff --git a/edxval/exceptions.py b/edxval/exceptions.py
index 4ae9e98a..13446001 100644
--- a/edxval/exceptions.py
+++ b/edxval/exceptions.py
@@ -62,3 +62,10 @@ class InvalidTranscriptProvider(ValError):
     This error is raised when an transcript provider is not supported
     """
     pass
+
+
+class TranscriptsGenerationException(ValError):
+    """
+    This error is raised when a transcript content is not parse-able in specified format.
+    """
+    pass
diff --git a/edxval/tests/test_api.py b/edxval/tests/test_api.py
index 5f54151a..690dd17d 100644
--- a/edxval/tests/test_api.py
+++ b/edxval/tests/test_api.py
@@ -35,6 +35,7 @@
                            VideoTranscript)
 from edxval.serializers import VideoSerializer
 from edxval.tests import APIAuthTestCase, constants
+from edxval.transcript_utils import Transcript
 
 
 def omit_attrs(dict, attrs_to_omit=[]):
@@ -973,11 +974,14 @@ def test_no_encodings(self):
         expected = self.parse_xml("""
             <video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
         """)
-        self.assert_xml_equal(
-            api.export_to_xml(constants.VIDEO_DICT_STAR['edx_video_id'], self.file_system, constants.EXPORT_IMPORT_STATIC_DIR),
-            expected
+        exported_metadata = api.export_to_xml(
+            resource_fs=self.file_system,
+            static_dir=constants.EXPORT_IMPORT_STATIC_DIR,
+            video_id=constants.VIDEO_DICT_STAR['edx_video_id'],
         )
 
+        self.assert_xml_equal(exported_metadata['xml'], expected)
+
     def test_no_video_transcript(self):
         """
         Verify that transcript export for video with no transcript is working as expected.
@@ -986,11 +990,12 @@ def test_no_video_transcript(self):
             <video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
         """)
 
-        exported_xml = api.export_to_xml(
+        exported_metadata = api.export_to_xml(
             constants.VIDEO_DICT_STAR['edx_video_id'],
             self.file_system,
             constants.EXPORT_IMPORT_STATIC_DIR
         )
+        exported_xml = exported_metadata['xml']
         self.assert_xml_equal(exported_xml, expected)
 
         # Verify that no transcript is present in the XML.
@@ -1011,29 +1016,29 @@ def test_basic(self, course_id, image):
                 <encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
                 <encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
                 <transcripts>
-                    <transcript file_format="sjson" language_code="de" provider="3PlayMedia" />
+                    <transcript file_format="srt" language_code="de" provider="3PlayMedia" />
                     <transcript file_format="srt" language_code="en" provider="Cielo24" />
                 </transcripts>
             </video_asset>
         """.format(image=image))
 
-        self.assert_xml_equal(
-            api.export_to_xml(
-                constants.VIDEO_DICT_FISH['edx_video_id'],
-                self.file_system,
-                constants.EXPORT_IMPORT_STATIC_DIR,
-                course_id
-            ),
-            expected
+        exported_metadata = api.export_to_xml(
+            constants.VIDEO_DICT_FISH['edx_video_id'],
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR,
+            course_id
         )
 
+        self.assert_xml_equal(exported_metadata['xml'], expected)
+        self.assertItemsEqual(exported_metadata['transcripts'].keys(), ['en', 'de'])
+
     def test_transcript_export(self):
         """
         Test that transcript are exported correctly.
         """
         language_code = 'en'
         video_id = constants.VIDEO_DICT_FISH['edx_video_id']
-        transcript_files = {'de': u'super-soaker-de.sjson', 'en': u'super-soaker-en.srt'}
+        transcript_files = {'de': u'super-soaker-de.srt', 'en': u'super-soaker-en.srt'}
         expected_transcript_path = combine(
             self.temp_dir,
             combine(constants.EXPORT_IMPORT_COURSE_DIR, constants.EXPORT_IMPORT_STATIC_DIR)
@@ -1045,16 +1050,21 @@ def test_transcript_export(self):
                 <encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
                 <encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
                 <transcripts>
-                    <transcript file_format="sjson" language_code="de" provider="3PlayMedia" />
+                    <transcript file_format="srt" language_code="de" provider="3PlayMedia" />
                     <transcript file_format="srt" language_code="en" provider="Cielo24" />
                 </transcripts>
             </video_asset>
         """)
 
-        exported_xml = api.export_to_xml(video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR, 'test-course')
+        exported_metadata = api.export_to_xml(
+            video_id=video_id,
+            course_id='test-course',
+            resource_fs=self.file_system,
+            static_dir=constants.EXPORT_IMPORT_STATIC_DIR
+        )
 
         # Assert video and transcript xml is exported correctly.
-        self.assert_xml_equal(exported_xml, expected_xml)
+        self.assert_xml_equal(exported_metadata['xml'], expected_xml)
 
         # Verify transcript file is created.
         self.assertItemsEqual(transcript_files.values(), self.file_system.listdir(constants.EXPORT_IMPORT_STATIC_DIR))
@@ -1065,7 +1075,13 @@ def test_transcript_export(self):
                 open(combine(expected_transcript_path, transcript_files[language_code]))
             ).read()
             transcript = api.get_video_transcript_data(video_id=video_id, language_code=language_code)
-            self.assertEqual(transcript['content'], expected_transcript_content)
+            transcript_format = os.path.splitext(transcript['file_name'])[1][1:]
+            exported_transcript_content = Transcript.convert(
+                transcript['content'],
+                input_format=transcript_format,
+                output_format=Transcript.SRT,
+            ).encode('utf-8')
+            self.assertEqual(exported_transcript_content, expected_transcript_content)
 
 
     def test_unknown_video(self):
@@ -1728,20 +1744,18 @@ def test_import_transcript_from_fs_bad_content(self, mock_logger):
         """
         language_code = 'en'
         edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id']
-        # First create transcript file.
+
+        # First create non utf-8 encoded transcript file in the file system.
         transcript_file_name = 'invalid-transcript.txt'
         invalid_transcript = dict(
             constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON,
             video_id=edx_video_id,
-            file_data=u'Привіт, edX вітає вас.'.encode('cp1251')
-        )
-        utils.create_file_in_fs(
-            invalid_transcript['file_data'],
-            transcript_file_name,
-            self.file_system,
-            constants.EXPORT_IMPORT_STATIC_DIR
+            file_data=u'Привіт, edX вітає вас.'
         )
 
+        with self.file_system.open(combine(constants.EXPORT_IMPORT_STATIC_DIR, transcript_file_name), 'wb') as f:
+            f.write(invalid_transcript['file_data'].encode('cp1251'))
+
         api.import_transcript_from_fs(
             edx_video_id=edx_video_id,
             language_code=language_code,
diff --git a/edxval/tests/test_transcript_utils.py b/edxval/tests/test_transcript_utils.py
new file mode 100644
index 00000000..cd14eb79
--- /dev/null
+++ b/edxval/tests/test_transcript_utils.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for transcript utils.
+"""
+import ddt
+import json
+import textwrap
+import unittest
+
+from edxval.transcript_utils import Transcript
+from edxval.exceptions import TranscriptsGenerationException
+
+
+@ddt.ddt
+class TestTranscriptUtils(unittest.TestCase):
+    """
+    Tests transcripts conversion util.
+    """
+    def setUp(self):
+        super(TestTranscriptUtils, self).setUp()
+
+        self.srt_transcript = textwrap.dedent("""\
+            0
+            00:00:10,500 --> 00:00:13,000
+            Elephant&#39;s Dream 大象的梦想
+
+            1
+            00:00:15,000 --> 00:00:18,000
+            At the left we can see...
+
+        """)
+
+        self.sjson_transcript = textwrap.dedent("""\
+            {
+                "start": [
+                    10500,
+                    15000
+                ],
+                "end": [
+                    13000,
+                    18000
+                ],
+                "text": [
+                    "Elephant&#39;s Dream 大象的梦想",
+                    "At the left we can see..."
+                ]
+            }
+        """)
+
+    @ddt.data(
+        ('invalid_input_format', 'sjson'),
+        ('sjson', 'invalid_output_format'),
+        ('invalid_input_format', 'invalid_output_format')
+    )
+    @ddt.unpack
+    def test_invalid_transcript_format(self, input_format, output_format):
+        """
+        Tests that transcript conversion raises `AssertionError` on invalid input/output formats.
+        """
+        with self.assertRaises(AssertionError):
+            Transcript.convert(self.sjson_transcript, input_format, output_format)
+
+    def test_convert_srt_to_srt(self):
+        """
+        Tests that srt to srt conversion works as expected.
+        """
+        expected = self.srt_transcript.decode('utf-8')
+        actual = Transcript.convert(self.srt_transcript, 'srt', 'srt')
+        self.assertEqual(actual, expected)
+
+    def test_convert_sjson_to_srt(self):
+        """
+        Tests that the sjson transcript is successfully converted into srt format.
+        """
+        expected = self.srt_transcript.decode('utf-8')
+        actual = Transcript.convert(self.sjson_transcript, 'sjson', 'srt')
+        self.assertEqual(actual, expected)
+
+    def test_convert_srt_to_sjson(self):
+        """
+        Tests that the srt transcript is successfully converted into sjson format.
+        """
+        expected = self.sjson_transcript.decode('utf-8')
+        actual = Transcript.convert(self.srt_transcript, 'srt', 'sjson')
+        self.assertDictEqual(json.loads(actual), json.loads(expected))
+
+    def test_convert_invalid_srt_to_sjson(self):
+        """
+        Tests that TranscriptsGenerationException was raises on trying
+        to convert invalid srt transcript to sjson.
+        """
+        invalid_srt_transcript = 'invalid SubRip file content'
+        with self.assertRaises(TranscriptsGenerationException):
+            Transcript.convert(invalid_srt_transcript, 'srt', 'sjson')
diff --git a/edxval/transcript_utils.py b/edxval/transcript_utils.py
new file mode 100644
index 00000000..058158e7
--- /dev/null
+++ b/edxval/transcript_utils.py
@@ -0,0 +1,117 @@
+"""
+A module containing transcripts utils.
+"""
+import json
+from six import text_type
+
+from pysrt import SubRipFile, SubRipItem, SubRipTime
+from pysrt.srtexc import Error
+
+from edxval.exceptions import TranscriptsGenerationException
+
+
+class Transcript(object):
+    """
+    Container for transcript methods.
+    """
+    SRT = 'srt'
+    SJSON = 'sjson'
+
+    @staticmethod
+    def generate_sjson_from_srt(srt_subs):
+        """
+        Generate transcripts from sjson to SubRip (*.srt).
+
+        Arguments:
+            srt_subs(SubRip): "SRT" subs object
+
+        Returns:
+            Subs converted to "SJSON" format.
+        """
+        sub_starts = []
+        sub_ends = []
+        sub_texts = []
+        for sub in srt_subs:
+            sub_starts.append(sub.start.ordinal)
+            sub_ends.append(sub.end.ordinal)
+            sub_texts.append(sub.text.replace('\n', ' '))
+
+        sjson_subs = {
+            'start': sub_starts,
+            'end': sub_ends,
+            'text': sub_texts
+        }
+        return sjson_subs
+
+    @staticmethod
+    def generate_srt_from_sjson(sjson_subs):
+        """
+        Generate transcripts from sjson to SubRip (*.srt).
+
+        Arguments:
+            sjson_subs (dict): `sjson` subs.
+
+        Returns:
+            Subtitles in SRT format.
+        """
+
+        output = ''
+
+        equal_len = len(sjson_subs['start']) == len(sjson_subs['end']) == len(sjson_subs['text'])
+        if not equal_len:
+            return output
+
+        for i in range(len(sjson_subs['start'])):
+            item = SubRipItem(
+                index=i,
+                start=SubRipTime(milliseconds=sjson_subs['start'][i]),
+                end=SubRipTime(milliseconds=sjson_subs['end'][i]),
+                text=sjson_subs['text'][i]
+            )
+            output += (unicode(item))
+            output += '\n'
+        return output
+
+    @classmethod
+    def convert(cls, content, input_format, output_format):
+        """
+        Convert transcript `content` from `input_format` to `output_format`.
+
+        Arguments:
+            content: Transcript content byte-stream.
+            input_format: Input transcript format.
+            output_format: Output transcript format.
+
+        Accepted input formats: sjson, srt.
+        Accepted output format: srt, sjson.
+
+        Raises:
+            TranscriptsGenerationException: On parsing the invalid srt
+            content during conversion from srt to sjson.
+        """
+        assert input_format in ('srt', 'sjson')
+        assert output_format in ('srt', 'sjson')
+
+        # Decode the content with utf-8-sig which will also
+        # skip byte order mark(BOM) character if found.
+        content = content.decode('utf-8-sig')
+
+        if input_format == output_format:
+            return content
+
+        if input_format == 'srt':
+
+            if output_format == 'sjson':
+                try:
+                    # With error handling (set to 'ERROR_RAISE'), we will be getting
+                    # the exception if something went wrong in parsing the transcript.
+                    srt_subs = SubRipFile.from_string(content, error_handling=SubRipFile.ERROR_RAISE)
+                except Error as ex:  # Base exception from pysrt
+                    raise TranscriptsGenerationException(text_type(ex))
+
+                return json.dumps(cls.generate_sjson_from_srt(srt_subs))
+
+        if input_format == 'sjson':
+
+            if output_format == 'srt':
+                return cls.generate_srt_from_sjson(json.loads(content))
diff --git a/edxval/utils.py b/edxval/utils.py
index 795c0344..10c5e56f 100644
--- a/edxval/utils.py
+++ b/edxval/utils.py
@@ -192,11 +192,11 @@ def create_file_in_fs(file_data, file_name, file_system, static_dir):
     Arguments:
         file_data (str): Data to store into the file.
         file_name (str): File name of the file to be created.
-        resource_fs (OSFS): Import file system.
+        file_system (OSFS): Import file system.
         static_dir (str): The Directory to retrieve transcript file.
     """
     with file_system.open(combine(static_dir, file_name), 'wb') as f:
-        f.write(file_data)
+        f.write(file_data.encode('utf-8'))
 
 
 def get_transcript_format(transcript_content):
diff --git a/setup.py b/setup.py
index 5e019c7f..23a0dc2b 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@ def load_requirements(*requirements_paths):
 
 setup(
     name='edxval',
-    version='0.1.14',
+    version='0.1.15',
     author='edX',
     url='http://github.com/edx/edx-val',
     description='edx-val',