Skip to content

Commit

Permalink
Merge pull request #138 from edx/mrehan/backward-transcript-export
Browse files Browse the repository at this point in the history
Backward compatible video transcripts export
  • Loading branch information
Qubad786 authored May 25, 2018
2 parents f7a077b + 0bbb8af commit 2830e2e
Show file tree
Hide file tree
Showing 7 changed files with 327 additions and 65 deletions.
102 changes: 66 additions & 36 deletions edxval/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
The internal API for VAL.
"""
import os
import logging
from enum import Enum
from uuid import uuid4
Expand All @@ -17,15 +18,35 @@
from lxml.etree import Element, SubElement
from pysrt.srtexc import Error

from edxval.exceptions import (InvalidTranscriptFormat,
InvalidTranscriptProvider, ValCannotCreateError,
ValCannotUpdateError, ValInternalError,
ValVideoNotFoundError)
from edxval.models import (CourseVideo, EncodedVideo, Profile, TranscriptPreference,
TranscriptProviderType, Video, VideoImage,
VideoTranscript, ThirdPartyTranscriptCredentialsState)
from edxval.exceptions import (
InvalidTranscriptFormat,
TranscriptsGenerationException,
InvalidTranscriptProvider,
ValCannotCreateError,
ValCannotUpdateError,
ValInternalError,
ValVideoNotFoundError,
)
from edxval.models import (
CourseVideo,
EncodedVideo,
Profile,
TranscriptPreference,
TranscriptProviderType,
Video,
VideoImage,
VideoTranscript,
ThirdPartyTranscriptCredentialsState,
)
from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
from edxval.utils import TranscriptFormat, THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs, get_transcript_format
from edxval.utils import (
TranscriptFormat,
THIRD_PARTY_TRANSCRIPTION_PLANS,
create_file_in_fs,
get_transcript_format,
)

from edxval.transcript_utils import Transcript


logger = logging.getLogger(__name__) # pylint: disable=C0103
Expand Down Expand Up @@ -829,6 +850,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
for name in ['profile', 'url', 'file_size', 'bitrate']
}
)

return create_transcripts_xml(video_id, video_el, resource_fs, static_dir)


Expand All @@ -843,21 +865,26 @@ def create_transcript_file(video_id, language_code, file_format, resource_fs, st
static_dir (str): The Directory to store transcript file.
resource_fs (SubFS): The file system to store transcripts.
"""
transcript_name = u'{video_id}-{language_code}.{file_format}'.format(
transcript_filename = '{video_id}-{language_code}.srt'.format(
video_id=video_id,
language_code=language_code,
file_format=file_format
language_code=language_code
)
transcript_data = get_video_transcript_data(video_id, language_code)
if transcript_data:
transcript_content = transcript_data['content']
create_file_in_fs(transcript_content, transcript_name, resource_fs, static_dir)
transcript_content = Transcript.convert(
transcript_data['content'],
input_format=file_format,
output_format=Transcript.SRT
)
create_file_in_fs(transcript_content, transcript_filename, resource_fs, static_dir)

return transcript_filename


def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
"""
Creates xml for transcripts.
For each transcript elment, an associated transcript file is also created in course OLX.
For each transcript element, an associated transcript file is also created in course OLX.
Arguments:
video_id (str): Video id of the video.
Expand All @@ -873,32 +900,36 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
if video_transcripts.exists():
transcripts_el = SubElement(video_el, 'transcripts')

exported_language_codes = []
transcript_files_map = {}
for video_transcript in video_transcripts:
if video_transcript.language_code not in exported_language_codes:
language_code = video_transcript.language_code
file_format = video_transcript.file_format
language_code = video_transcript.language_code
file_format = video_transcript.file_format

create_transcript_file(
video_id,
language_code,
file_format,
resource_fs.delegate_fs(),
combine(u'course', static_dir) # File system should not start from /draft directory.
try:
transcript_filename = create_transcript_file(
video_id=video_id,
language_code=language_code,
file_format=file_format,
resource_fs=resource_fs.delegate_fs(),
static_dir=combine(u'course', static_dir) # File system should not start from /draft directory.
)
transcript_files_map[language_code] = transcript_filename
except TranscriptsGenerationException:
# we don't want to halt export in this case, just log and move to the next transcript.
logger.exception('[VAL] Error while generating "%s" transcript for video["%s"].', language_code, video_id)
continue

SubElement(
transcripts_el,
'transcript',
{
'language_code': language_code,
'file_format': file_format,
'provider': video_transcript.provider,
}
)
exported_language_codes.append(video_transcript.language_code)
SubElement(
transcripts_el,
'transcript',
{
'language_code': language_code,
'file_format': Transcript.SRT,
'provider': video_transcript.provider,
}
)

return video_el
return dict(xml=video_el, transcripts=transcript_files_map)


def import_from_xml(xml, edx_video_id, resource_fs, static_dir, external_transcripts=dict(), course_id=None):
Expand Down Expand Up @@ -1033,7 +1064,6 @@ def import_transcript_from_fs(edx_video_id, language_code, file_name, provider,
)
return


# Get file format from transcript content.
try:
file_format = get_transcript_format(file_content)
Expand Down
7 changes: 7 additions & 0 deletions edxval/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,10 @@ class InvalidTranscriptProvider(ValError):
This error is raised when a transcript provider is not supported
"""
pass


class TranscriptsGenerationException(ValError):
    """
    Raised when transcript content cannot be parsed in the specified format.
    """
66 changes: 40 additions & 26 deletions edxval/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
VideoTranscript)
from edxval.serializers import VideoSerializer
from edxval.tests import APIAuthTestCase, constants
from edxval.transcript_utils import Transcript


def omit_attrs(dict, attrs_to_omit=[]):
Expand Down Expand Up @@ -973,11 +974,14 @@ def test_no_encodings(self):
expected = self.parse_xml("""
<video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
""")
self.assert_xml_equal(
api.export_to_xml(constants.VIDEO_DICT_STAR['edx_video_id'], self.file_system, constants.EXPORT_IMPORT_STATIC_DIR),
expected
exported_metadata = api.export_to_xml(
resource_fs=self.file_system,
static_dir=constants.EXPORT_IMPORT_STATIC_DIR,
video_id=constants.VIDEO_DICT_STAR['edx_video_id'],
)

self.assert_xml_equal(exported_metadata['xml'], expected)

def test_no_video_transcript(self):
"""
Verify that transcript export for video with no transcript is working as expected.
Expand All @@ -986,11 +990,12 @@ def test_no_video_transcript(self):
<video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
""")

exported_xml = api.export_to_xml(
exported_metadata = api.export_to_xml(
constants.VIDEO_DICT_STAR['edx_video_id'],
self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR
)
exported_xml = exported_metadata['xml']
self.assert_xml_equal(exported_xml, expected)

# Verify that no transcript is present in the XML.
Expand All @@ -1011,29 +1016,29 @@ def test_basic(self, course_id, image):
<encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
<encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
<transcripts>
<transcript file_format="sjson" language_code="de" provider="3PlayMedia" />
<transcript file_format="srt" language_code="de" provider="3PlayMedia" />
<transcript file_format="srt" language_code="en" provider="Cielo24" />
</transcripts>
</video_asset>
""".format(image=image))

self.assert_xml_equal(
api.export_to_xml(
constants.VIDEO_DICT_FISH['edx_video_id'],
self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR,
course_id
),
expected
exported_metadata = api.export_to_xml(
constants.VIDEO_DICT_FISH['edx_video_id'],
self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR,
course_id
)

self.assert_xml_equal(exported_metadata['xml'], expected)
self.assertItemsEqual(exported_metadata['transcripts'].keys(), ['en', 'de'])

def test_transcript_export(self):
"""
Test that transcripts are exported correctly.
"""
language_code = 'en'
video_id = constants.VIDEO_DICT_FISH['edx_video_id']
transcript_files = {'de': u'super-soaker-de.sjson', 'en': u'super-soaker-en.srt'}
transcript_files = {'de': u'super-soaker-de.srt', 'en': u'super-soaker-en.srt'}
expected_transcript_path = combine(
self.temp_dir,
combine(constants.EXPORT_IMPORT_COURSE_DIR, constants.EXPORT_IMPORT_STATIC_DIR)
Expand All @@ -1045,16 +1050,21 @@ def test_transcript_export(self):
<encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
<encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
<transcripts>
<transcript file_format="sjson" language_code="de" provider="3PlayMedia" />
<transcript file_format="srt" language_code="de" provider="3PlayMedia" />
<transcript file_format="srt" language_code="en" provider="Cielo24" />
</transcripts>
</video_asset>
""")

exported_xml = api.export_to_xml(video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR, 'test-course')
exported_metadata = api.export_to_xml(
video_id=video_id,
course_id='test-course',
resource_fs=self.file_system,
static_dir=constants.EXPORT_IMPORT_STATIC_DIR
)

# Assert video and transcript xml is exported correctly.
self.assert_xml_equal(exported_xml, expected_xml)
self.assert_xml_equal(exported_metadata['xml'], expected_xml)

# Verify transcript file is created.
self.assertItemsEqual(transcript_files.values(), self.file_system.listdir(constants.EXPORT_IMPORT_STATIC_DIR))
Expand All @@ -1065,7 +1075,13 @@ def test_transcript_export(self):
open(combine(expected_transcript_path, transcript_files[language_code]))
).read()
transcript = api.get_video_transcript_data(video_id=video_id, language_code=language_code)
self.assertEqual(transcript['content'], expected_transcript_content)
transcript_format = os.path.splitext(transcript['file_name'])[1][1:]
exported_transcript_content = Transcript.convert(
transcript['content'],
input_format=transcript_format,
output_format=Transcript.SRT,
).encode('utf-8')
self.assertEqual(exported_transcript_content, expected_transcript_content)


def test_unknown_video(self):
Expand Down Expand Up @@ -1728,20 +1744,18 @@ def test_import_transcript_from_fs_bad_content(self, mock_logger):
"""
language_code = 'en'
edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id']
# First create transcript file.

# First create non utf-8 encoded transcript file in the file system.
transcript_file_name = 'invalid-transcript.txt'
invalid_transcript = dict(
constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON,
video_id=edx_video_id,
file_data=u'Привіт, edX вітає вас.'.encode('cp1251')
)
utils.create_file_in_fs(
invalid_transcript['file_data'],
transcript_file_name,
self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR
file_data=u'Привіт, edX вітає вас.'
)

with self.file_system.open(combine(constants.EXPORT_IMPORT_STATIC_DIR, transcript_file_name), 'wb') as f:
f.write(invalid_transcript['file_data'].encode('cp1251'))

api.import_transcript_from_fs(
edx_video_id=edx_video_id,
language_code=language_code,
Expand Down
Loading

0 comments on commit 2830e2e

Please sign in to comment.