Merge pull request #138 from edx/mrehan/backward-transcript-export

Backward compatible video transcripts export
openedx · May 25, 2018 · 2830e2e · 2830e2e
2 parents f7a077b + 0bbb8af
commit 2830e2e
Show file tree

Hide file tree

Showing 7 changed files with 327 additions and 65 deletions.
diff --git a/edxval/api.py b/edxval/api.py
@@ -3,6 +3,7 @@
 """
 The internal API for VAL.
 """
+import os
 import logging
 from enum import Enum
 from uuid import uuid4
@@ -17,15 +18,35 @@
 from lxml.etree import Element, SubElement
 from pysrt.srtexc import Error
 
-from edxval.exceptions import (InvalidTranscriptFormat,
-                               InvalidTranscriptProvider, ValCannotCreateError,
-                               ValCannotUpdateError, ValInternalError,
-                               ValVideoNotFoundError)
-from edxval.models import (CourseVideo, EncodedVideo, Profile, TranscriptPreference,
-                           TranscriptProviderType, Video, VideoImage,
-                           VideoTranscript, ThirdPartyTranscriptCredentialsState)
+from edxval.exceptions import (
+    InvalidTranscriptFormat,
+    TranscriptsGenerationException,
+    InvalidTranscriptProvider,
+    ValCannotCreateError,
+    ValCannotUpdateError,
+    ValInternalError,
+    ValVideoNotFoundError,
+)
+from edxval.models import (
+    CourseVideo,
+    EncodedVideo,
+    Profile,
+    TranscriptPreference,
+    TranscriptProviderType,
+    Video,
+    VideoImage,
+    VideoTranscript,
+    ThirdPartyTranscriptCredentialsState,
+)
 from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
-from edxval.utils import TranscriptFormat, THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs, get_transcript_format
+from edxval.utils import (
+    TranscriptFormat,
+    THIRD_PARTY_TRANSCRIPTION_PLANS,
+    create_file_in_fs,
+    get_transcript_format,
+)
+
+from edxval.transcript_utils import Transcript
 
 
 logger = logging.getLogger(__name__)  # pylint: disable=C0103
@@ -829,6 +850,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
                 for name in ['profile', 'url', 'file_size', 'bitrate']
             }
         )
+
     return create_transcripts_xml(video_id, video_el, resource_fs, static_dir)
 
 
@@ -843,21 +865,26 @@ def create_transcript_file(video_id, language_code, file_format, resource_fs, st
         static_dir (str): The Directory to store transcript file.
         resource_fs (SubFS): The file system to store transcripts.
     """
-    transcript_name = u'{video_id}-{language_code}.{file_format}'.format(
+    transcript_filename = '{video_id}-{language_code}.srt'.format(
         video_id=video_id,
-        language_code=language_code,
-        file_format=file_format
+        language_code=language_code
     )
     transcript_data = get_video_transcript_data(video_id, language_code)
     if transcript_data:
-        transcript_content = transcript_data['content']
-        create_file_in_fs(transcript_content, transcript_name, resource_fs, static_dir)
+        transcript_content = Transcript.convert(
+            transcript_data['content'],
+            input_format=file_format,
+            output_format=Transcript.SRT
+        )
+        create_file_in_fs(transcript_content, transcript_filename, resource_fs, static_dir)
+
+    return transcript_filename
 
 
 def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
     """
     Creates xml for transcripts.
-    For each transcript elment, an associated transcript file is also created in course OLX.
+    For each transcript element, an associated transcript file is also created in course OLX.
 
     Arguments:
         video_id (str): Video id of the video.
@@ -873,32 +900,36 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
     if video_transcripts.exists():
         transcripts_el = SubElement(video_el, 'transcripts')
 
-    exported_language_codes = []
+    transcript_files_map = {}
     for video_transcript in video_transcripts:
-        if video_transcript.language_code not in exported_language_codes:
-            language_code = video_transcript.language_code
-            file_format = video_transcript.file_format
+        language_code = video_transcript.language_code
+        file_format = video_transcript.file_format
 
-            create_transcript_file(
-                video_id,
-                language_code,
-                file_format,
-                resource_fs.delegate_fs(),
-                combine(u'course', static_dir)  # File system should not start from /draft directory.
+        try:
+            transcript_filename = create_transcript_file(
+                video_id=video_id,
+                language_code=language_code,
+                file_format=file_format,
+                resource_fs=resource_fs.delegate_fs(),
+                static_dir=combine(u'course', static_dir)  # File system should not start from /draft directory.
             )
+            transcript_files_map[language_code] = transcript_filename
+        except TranscriptsGenerationException:
+            # we don't want to halt export in this case, just log and move to the next transcript.
+            logger.exception('[VAL] Error while generating "%s" transcript for video["%s"].', language_code, video_id)
+            continue
 
-            SubElement(
-                transcripts_el,
-                'transcript',
-                {
-                    'language_code': language_code,
-                    'file_format': file_format,
-                    'provider': video_transcript.provider,
-                }
-            )
-            exported_language_codes.append(video_transcript.language_code)
+        SubElement(
+            transcripts_el,
+            'transcript',
+            {
+                'language_code': language_code,
+                'file_format': Transcript.SRT,
+                'provider': video_transcript.provider,
+            }
+        )
 
-    return video_el
+    return dict(xml=video_el, transcripts=transcript_files_map)
 
 
 def import_from_xml(xml, edx_video_id, resource_fs, static_dir, external_transcripts=dict(), course_id=None):
@@ -1033,7 +1064,6 @@ def import_transcript_from_fs(edx_video_id, language_code, file_name, provider,
             )
             return
 
-
         # Get file format from transcript content.
         try:
             file_format = get_transcript_format(file_content)

diff --git a/edxval/exceptions.py b/edxval/exceptions.py
@@ -62,3 +62,10 @@ class InvalidTranscriptProvider(ValError):
     This error is raised when a transcript provider is not supported
     """
     pass
+
+
+class TranscriptsGenerationException(ValError):
+    """
+    This error is raised when transcript content cannot be parsed in the specified format.
+    """
+    pass
diff --git a/edxval/tests/test_api.py b/edxval/tests/test_api.py
@@ -35,6 +35,7 @@
                            VideoTranscript)
 from edxval.serializers import VideoSerializer
 from edxval.tests import APIAuthTestCase, constants
+from edxval.transcript_utils import Transcript
 
 
 def omit_attrs(dict, attrs_to_omit=[]):
@@ -973,11 +974,14 @@ def test_no_encodings(self):
         expected = self.parse_xml("""
             <video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
         """)
-        self.assert_xml_equal(
-            api.export_to_xml(constants.VIDEO_DICT_STAR['edx_video_id'], self.file_system, constants.EXPORT_IMPORT_STATIC_DIR),
-            expected
+        exported_metadata = api.export_to_xml(
+            resource_fs=self.file_system,
+            static_dir=constants.EXPORT_IMPORT_STATIC_DIR,
+            video_id=constants.VIDEO_DICT_STAR['edx_video_id'],
         )
 
+        self.assert_xml_equal(exported_metadata['xml'], expected)
+
     def test_no_video_transcript(self):
         """
         Verify that transcript export for video with no transcript is working as expected.
@@ -986,11 +990,12 @@ def test_no_video_transcript(self):
             <video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
         """)
 
-        exported_xml = api.export_to_xml(
+        exported_metadata = api.export_to_xml(
             constants.VIDEO_DICT_STAR['edx_video_id'],
             self.file_system,
             constants.EXPORT_IMPORT_STATIC_DIR
         )
+        exported_xml = exported_metadata['xml']
         self.assert_xml_equal(exported_xml, expected)
 
         # Verify that no transcript is present in the XML.
@@ -1011,29 +1016,29 @@ def test_basic(self, course_id, image):
                 <encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
                 <encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
                 <transcripts>
-                    <transcript file_format="sjson" language_code="de" provider="3PlayMedia" />
+                    <transcript file_format="srt" language_code="de" provider="3PlayMedia" />
                     <transcript file_format="srt" language_code="en" provider="Cielo24" />
                 </transcripts>
             </video_asset>
         """.format(image=image))
 
-        self.assert_xml_equal(
-            api.export_to_xml(
-                constants.VIDEO_DICT_FISH['edx_video_id'],
-                self.file_system,
-                constants.EXPORT_IMPORT_STATIC_DIR,
-                course_id
-            ),
-            expected
+        exported_metadata = api.export_to_xml(
+            constants.VIDEO_DICT_FISH['edx_video_id'],
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR,
+            course_id
         )
 
+        self.assert_xml_equal(exported_metadata['xml'], expected)
+        self.assertItemsEqual(exported_metadata['transcripts'].keys(), ['en', 'de'])
+
     def test_transcript_export(self):
         """
         Test that transcripts are exported correctly.
         """
         language_code = 'en'
         video_id = constants.VIDEO_DICT_FISH['edx_video_id']
-        transcript_files = {'de': u'super-soaker-de.sjson', 'en': u'super-soaker-en.srt'}
+        transcript_files = {'de': u'super-soaker-de.srt', 'en': u'super-soaker-en.srt'}
         expected_transcript_path = combine(
             self.temp_dir,
             combine(constants.EXPORT_IMPORT_COURSE_DIR, constants.EXPORT_IMPORT_STATIC_DIR)
@@ -1045,16 +1050,21 @@ def test_transcript_export(self):
                 <encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
                 <encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
                 <transcripts>
-                    <transcript file_format="sjson" language_code="de" provider="3PlayMedia" />
+                    <transcript file_format="srt" language_code="de" provider="3PlayMedia" />
                     <transcript file_format="srt" language_code="en" provider="Cielo24" />
                 </transcripts>
             </video_asset>
         """)
 
-        exported_xml = api.export_to_xml(video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR, 'test-course')
+        exported_metadata = api.export_to_xml(
+            video_id=video_id,
+            course_id='test-course',
+            resource_fs=self.file_system,
+            static_dir=constants.EXPORT_IMPORT_STATIC_DIR
+        )
 
         # Assert video and transcript xml is exported correctly.
-        self.assert_xml_equal(exported_xml, expected_xml)
+        self.assert_xml_equal(exported_metadata['xml'], expected_xml)
 
         # Verify transcript file is created.
         self.assertItemsEqual(transcript_files.values(), self.file_system.listdir(constants.EXPORT_IMPORT_STATIC_DIR))
@@ -1065,7 +1075,13 @@ def test_transcript_export(self):
                 open(combine(expected_transcript_path, transcript_files[language_code]))
             ).read()
             transcript = api.get_video_transcript_data(video_id=video_id, language_code=language_code)
-            self.assertEqual(transcript['content'], expected_transcript_content)
+            transcript_format = os.path.splitext(transcript['file_name'])[1][1:]
+            exported_transcript_content = Transcript.convert(
+                transcript['content'],
+                input_format=transcript_format,
+                output_format=Transcript.SRT,
+            ).encode('utf-8')
+            self.assertEqual(exported_transcript_content, expected_transcript_content)
 
 
     def test_unknown_video(self):
@@ -1728,20 +1744,18 @@ def test_import_transcript_from_fs_bad_content(self, mock_logger):
         """
         language_code = 'en'
         edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id']
-        # First create transcript file.
+
+        # First create a non-UTF-8 encoded transcript file in the file system.
         transcript_file_name = 'invalid-transcript.txt'
         invalid_transcript = dict(
             constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON,
             video_id=edx_video_id,
-            file_data=u'Привіт, edX вітає вас.'.encode('cp1251')
-        )
-        utils.create_file_in_fs(
-            invalid_transcript['file_data'],
-            transcript_file_name,
-            self.file_system,
-            constants.EXPORT_IMPORT_STATIC_DIR
+            file_data=u'Привіт, edX вітає вас.'
         )
 
+        with self.file_system.open(combine(constants.EXPORT_IMPORT_STATIC_DIR, transcript_file_name), 'wb') as f:
+            f.write(invalid_transcript['file_data'].encode('cp1251'))
+
         api.import_transcript_from_fs(
             edx_video_id=edx_video_id,
             language_code=language_code,