feat: Add YouTube transcript extraction component and frontend integr…

…ation (langflow-ai#4502) * add new youtube transcripts component * [autofix.ci] apply automated fixes * ✨ (youtube-transcripts.spec.ts): add integration test for youtube transcripts component in the frontend to ensure user can interact with it successfully * [autofix.ci] apply automated fixes --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
headlinevc · Nov 26, 2024 · 86d7e20 · 86d7e20
1 parent 0d4b010
commit 86d7e20
Show file tree

Hide file tree

Showing 8 changed files with 9,946 additions and 0 deletions.
diff --git a/src/backend/base/langflow/components/tools/__init__.py b/src/backend/base/langflow/components/tools/__init__.py
@@ -19,6 +19,7 @@
 from .wikipedia_api import WikipediaAPIComponent
 from .wolfram_alpha_api import WolframAlphaAPIComponent
 from .yahoo_finance import YfinanceToolComponent
+from .youtube_transcripts import YouTubeTranscriptsComponent
 
 with warnings.catch_warnings():
     warnings.simplefilter("ignore", LangChainDeprecationWarning)
@@ -45,4 +46,5 @@
     "WikipediaAPIComponent",
     "WolframAlphaAPIComponent",
     "YfinanceToolComponent",
+    "YouTubeTranscriptsComponent",
 ]
diff --git a/src/backend/base/langflow/components/tools/youtube_transcripts.py b/src/backend/base/langflow/components/tools/youtube_transcripts.py
@@ -0,0 +1,173 @@
+from langchain.tools import StructuredTool
+from langchain_community.document_loaders import YoutubeLoader
+from langchain_community.document_loaders.youtube import TranscriptFormat
+from langchain_core.tools import ToolException
+from pydantic import BaseModel, Field
+
+from langflow.base.langchain_utilities.model import LCToolComponent
+from langflow.field_typing import Tool
+from langflow.inputs import DropdownInput, IntInput, MultilineInput
+from langflow.schema import Data
+from langflow.template import Output
+
+
+class YoutubeApiSchema(BaseModel):
+    """Schema to define the input structure for the tool."""
+
+    url: str = Field(..., description="The YouTube URL to get transcripts from.")
+    transcript_format: TranscriptFormat = Field(
+        TranscriptFormat.TEXT,
+        description="The format of the transcripts. Either 'text' for a single "
+        "text output or 'chunks' for timestamped chunks.",
+    )
+    chunk_size_seconds: int = Field(
+        120,
+        description="The size of each transcript chunk in seconds. Only "
+        "applicable when 'Transcript Format' is set to 'chunks'.",
+    )
+    language: str = Field(
+        "",
+        description="A comma-separated list of language codes in descending " "priority. Leave empty for default.",
+    )
+    translation: str = Field(
+        "", description="Translate the transcripts to the specified language. " "Leave empty for no translation."
+    )
+
+
+class YouTubeTranscriptsComponent(LCToolComponent):
+    """A component that extracts spoken content from YouTube videos as transcripts."""
+
+    display_name: str = "YouTube Transcripts"
+    description: str = "Extracts spoken content from YouTube videos as transcripts."
+    icon: str = "YouTube"
+
+    inputs = [
+        MultilineInput(
+            name="url", display_name="Video URL", info="Enter the YouTube video URL to get transcripts from."
+        ),
+        DropdownInput(
+            name="transcript_format",
+            display_name="Transcript Format",
+            options=["text", "chunks"],
+            value="text",
+            info="The format of the transcripts. Either 'text' for a single output "
+            "or 'chunks' for timestamped chunks.",
+        ),
+        IntInput(
+            name="chunk_size_seconds",
+            display_name="Chunk Size (seconds)",
+            value=60,
+            advanced=True,
+            info="The size of each transcript chunk in seconds. Only applicable when "
+            "'Transcript Format' is set to 'chunks'.",
+        ),
+        MultilineInput(
+            name="language",
+            display_name="Language",
+            info="A comma-separated list of language codes in descending priority. " "Leave empty for default.",
+        ),
+        DropdownInput(
+            name="translation",
+            display_name="Translation Language",
+            advanced=True,
+            options=["", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "hi", "ar", "id"],
+            info="Translate the transcripts to the specified language. " "Leave empty for no translation.",
+        ),
+    ]
+
+    outputs = [
+        Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
+        Output(name="transcripts_tool", display_name="Tool", method="build_youtube_tool"),
+    ]
+
+    def build_youtube_transcripts(self) -> Data | list[Data]:
+        """Method to build transcripts from the provided YouTube URL.
+
+        Returns:
+            Data | list[Data]: The transcripts of the video, either as a single
+            Data object or a list of Data objects.
+        """
+        try:
+            loader = YoutubeLoader.from_youtube_url(
+                self.url,
+                transcript_format=TranscriptFormat.TEXT
+                if self.transcript_format == "text"
+                else TranscriptFormat.CHUNKS,
+                chunk_size_seconds=self.chunk_size_seconds,
+                language=self.language.split(",") if self.language else ["en"],
+                translation=self.translation if self.translation else None,
+            )
+
+            transcripts = loader.load()
+
+            if self.transcript_format == "text":
+                # Extract only the page_content from the Document
+                return Data(data={"transcripts": transcripts[0].page_content})
+            # For chunks, extract page_content and metadata separately
+            return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
+
+        except Exception as exc:  # noqa: BLE001
+            # Using a specific error type for the return value
+            return Data(data={"error": f"Failed to get YouTube transcripts: {exc!s}"})
+
+    def youtube_transcripts(
+        self,
+        url: str = "",
+        transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
+        chunk_size_seconds: int = 120,
+        language: str = "",
+        translation: str = "",
+    ) -> Data | list[Data]:
+        """Helper method to handle transcripts outside of component calls.
+
+        Args:
+            url: The YouTube URL to get transcripts from.
+            transcript_format: Format of transcripts ('text' or 'chunks').
+            chunk_size_seconds: Size of each transcript chunk in seconds.
+            language: Comma-separated list of language codes.
+            translation: Target language for translation.
+
+        Returns:
+            Data | list[Data]: Video transcripts as single Data or list of Data.
+        """
+        try:
+            if isinstance(transcript_format, str):
+                transcript_format = TranscriptFormat(transcript_format)
+            loader = YoutubeLoader.from_youtube_url(
+                url,
+                transcript_format=TranscriptFormat.TEXT
+                if transcript_format == TranscriptFormat.TEXT
+                else TranscriptFormat.CHUNKS,
+                chunk_size_seconds=chunk_size_seconds,
+                language=language.split(",") if language else ["en"],
+                translation=translation if translation else None,
+            )
+
+            transcripts = loader.load()
+            if transcript_format == TranscriptFormat.TEXT and len(transcripts) > 0:
+                return Data(data={"transcript": transcripts[0].page_content})
+            return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
+        except Exception as exc:
+            msg = f"Failed to get YouTube transcripts: {exc!s}"
+            raise ToolException(msg) from exc
+
+    def build_youtube_tool(self) -> Tool:
+        """Method to build the transcripts tool.
+
+        Returns:
+            Tool: A structured tool that uses the transcripts method.
+
+        Raises:
+            RuntimeError: If tool creation fails.
+        """
+        try:
+            return StructuredTool.from_function(
+                name="youtube_transcripts",
+                description="Get transcripts from YouTube videos.",
+                func=self.youtube_transcripts,
+                args_schema=YoutubeApiSchema,
+            )
+
+        except Exception as exc:
+            msg = f"Failed to build the YouTube transcripts tool: {exc!s}"
+            raise RuntimeError(msg) from exc
diff --git a/src/frontend/src/icons/Youtube/index.tsx b/src/frontend/src/icons/Youtube/index.tsx
@@ -0,0 +1,9 @@
+import React, { forwardRef } from "react";
+import YouTubeIcon from "./youtube";
+
+export const YouTubeSvgIcon = forwardRef<
+  SVGSVGElement,
+  React.PropsWithChildren<{}>
+>((props, ref) => {
+  return <YouTubeIcon ref={ref} {...props} />;
+});