More fixes
david1542 committed Sep 17, 2024
1 parent a4e753a commit 4308565
Showing 16 changed files with 874 additions and 98 deletions.
1 change: 1 addition & 0 deletions services/api/src/agent/tools/static/semantic_search.ts
@@ -88,6 +88,7 @@ export default async function (context: RunContext)
title = "PagerDuty Alert";
break;
}
case "Jira":
case "Confluence": {
url = document.metadata.url;
title = document.metadata.title;
2 changes: 1 addition & 1 deletion services/data-processor/src/loaders/confluence.py
@@ -1,7 +1,7 @@
from collections import namedtuple
import os
import requests
from loaders.raw_readers.confluence import ConfluenceReader
from loaders.readers.confluence import ConfluenceReader
from atlassian import Confluence

from db.types import Integration
8 changes: 5 additions & 3 deletions services/data-processor/src/loaders/github.py
@@ -2,13 +2,13 @@
from github import Github, Auth, GithubException

# from llama_index.core import SimpleDirectoryReader
from llama_index.readers.github.repository.github_client import GithubClient
from loaders.utils.github_client import GithubClient
from llama_index.readers.github import (
GitHubIssuesClient,
)
from db.types import Integration
from loaders.raw_readers.github_repo import GithubRepositoryReader
from loaders.raw_readers.github_issues import GitHubRepositoryIssuesReader
from loaders.readers.github_repo import GithubRepositoryReader
from loaders.readers.github_issues import GitHubRepositoryIssuesReader


def get_repos(token: str, repos_to_sync=None):
@@ -70,6 +70,8 @@ async def fetch_github_documents(
# # TODO: this can crash if the repo is huge, because of Github API Rate limit.
# # Need to find a way to "wait" maybe or to filter garbage.
code_client = GithubClient(token, fail_on_http_error=False, verbose=True)

# TODO: updated_at timestamp doesn't seem to work (our code treats same docs as new)
loader = GithubRepositoryReader(
github_client=code_client,
owner=owner,
26 changes: 20 additions & 6 deletions services/data-processor/src/loaders/jira.py
@@ -1,7 +1,11 @@
import requests
from llama_index.readers.jira import JiraReader
from datetime import datetime, timezone
from dateutil import parser
from loaders.readers.jira import JiraReader
from db.types import Integration

JQL_QUERY = "issuetype is not EMPTY"


def fetch_jira_documents(integration: Integration):
integration_type = integration.type
@@ -19,9 +23,7 @@ def fetch_jira_documents(integration: Integration):
loader = JiraReader(
Oauth2={"cloud_id": cloud_id, "api_token": access_token}
)
documents = loader.load_data(
"issuetype is not EMPTY"
) # This "should" fetch all issues
documents = loader.load_data(JQL_QUERY) # This "should" fetch all issues
total_documents.extend(documents)
else:
loader = JiraReader(
@@ -31,12 +33,24 @@
"server_url": integration.metadata["site_url"],
}
)
documents = loader.load_data("issuetype is not EMPTY")
documents = loader.load_data(JQL_QUERY)
total_documents.extend(documents)

# Adding the global "source" metadata field
for document in total_documents:
document.metadata.pop("labels", None)
document.metadata["source"] = "Jira"

return documents
# Transform 'created_at' and 'updated_at' to UTC with milliseconds
created_at = parser.isoparse(document.metadata["created_at"])
updated_at = parser.isoparse(document.metadata["updated_at"])
document.metadata["created_at"] = (
created_at.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]
+ "Z"
)
document.metadata["updated_at"] = (
updated_at.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]
+ "Z"
)

return total_documents
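The loop above strips the labels field, tags each document with a global source, and normalizes Jira's ISO-8601 timestamps to UTC with millisecond precision and a trailing "Z". A minimal standalone sketch of that timestamp normalization, assuming the same dateutil dependency (to_utc_millis is a hypothetical helper, not part of the codebase):

from datetime import timezone
from dateutil import parser

def to_utc_millis(timestamp: str) -> str:
    # Parse an ISO-8601 timestamp, convert it to UTC, and keep
    # millisecond precision with a trailing "Z".
    dt = parser.isoparse(timestamp).astimezone(timezone.utc)
    return dt.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"

# to_utc_millis("2024-09-17T12:34:56.789012+03:00") -> "2024-09-17T09:34:56.789Z"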
2 changes: 1 addition & 1 deletion services/data-processor/src/loaders/notion.py
@@ -1,6 +1,6 @@
from db.types import Integration
from notion_client import Client
from loaders.raw_readers.notion import NotionPageReader
from loaders.readers.notion import NotionPageReader


def fetch_notion_documents(integration: Integration):
82 changes: 5 additions & 77 deletions services/data-processor/src/loaders/pagerduty.py
@@ -1,83 +1,11 @@
from db.types import Integration
import httpx
from llama_index.core import Document

INCIDENT_TEXT_TEMPLATE = """
Incident title: {title}
Incident description: {description}
Incident summary: {summary}
Incident status: {status}
Service name: {service_name}
Created at: {created_at}
"""


async def get_incidents(integration: Integration):
access_token = integration.credentials["access_token"]
integration_type = integration.type
headers = {}
if integration_type == "basic":
headers["Authorization"] = f"Token token={access_token}"
elif integration_type == "oauth":
headers["Authorization"] = f"Bearer {access_token}"
else:
raise ValueError(f"Invalid integration type: {integration_type}")

limit = 100
offset = 0
resolved_incidents = []
while True:
async with httpx.AsyncClient() as client:
response = await client.get(
"https://api.pagerduty.com/incidents",
headers=headers,
params={
"date_range": "all",
"statuses[]": "resolved",
"limit": limit,
"offset": offset,
},
)
data = response.json()
incidents = data["incidents"]
resolved_incidents.extend(incidents)
if not data["more"]:
break
offset += limit
return resolved_incidents
from loaders.readers.pagerduty import PagerDutyReader


async def fetch_pagerduty_documents(integration: Integration):
incidents = await get_incidents(integration)

documents = []
for incident in incidents:
service = incident.get("service", {})
service_name = service.get("summary", "Unknown")

text = INCIDENT_TEXT_TEMPLATE.format(
title=incident["title"],
description=incident["description"],
summary=incident["summary"],
status=incident["status"],
service_name=service_name,
created_at=incident["created_at"],
)
metadata = {
"source": "PagerDuty",
"id": incident["id"],
"link": incident["html_url"],
"status": incident["status"],
"urgency": incident["urgency"],
"service_id": service.get("id", "Unknown"),
"first_trigger_log_entry_id": incident.get(
"first_trigger_log_entry", {}
).get("id", "Unknown"),
"created_at": incident["created_at"],
"updated_at": incident["updated_at"],
}

document = Document(doc_id=incident["id"], text=text, metadata=metadata)
documents.append(document)
access_token = integration.credentials["access_token"]
token_type = integration.type
loader = PagerDutyReader(access_token, token_type)
documents = await loader.load_data()

return documents
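fetch_pagerduty_documents now delegates the API calls to a PagerDutyReader in loaders/readers/pagerduty.py, which is not among the files shown here. A hypothetical sketch of such a reader, assuming it keeps the auth headers, pagination, and Document construction of the code removed above (the actual class may differ):

from typing import List
import httpx
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader

class PagerDutyReader(BaseReader):
    def __init__(self, access_token: str, token_type: str):
        self.access_token = access_token
        self.token_type = token_type  # "basic" or "oauth"

    @property
    def headers(self) -> dict:
        if self.token_type == "basic":
            return {"Authorization": f"Token token={self.access_token}"}
        if self.token_type == "oauth":
            return {"Authorization": f"Bearer {self.access_token}"}
        raise ValueError(f"Invalid integration type: {self.token_type}")

    async def load_data(self) -> List[Document]:
        # Page through resolved incidents and wrap each one in a Document,
        # carrying a subset of the metadata the old in-file implementation produced.
        documents, offset, limit = [], 0, 100
        async with httpx.AsyncClient() as client:
            while True:
                response = await client.get(
                    "https://api.pagerduty.com/incidents",
                    headers=self.headers,
                    params={"date_range": "all", "statuses[]": "resolved",
                            "limit": limit, "offset": offset},
                )
                data = response.json()
                for incident in data["incidents"]:
                    documents.append(
                        Document(
                            doc_id=incident["id"],
                            text=incident["title"],
                            metadata={
                                "source": "PagerDuty",
                                "link": incident["html_url"],
                                "status": incident["status"],
                                "created_at": incident["created_at"],
                                "updated_at": incident["updated_at"],
                            },
                        )
                    )
                if not data["more"]:
                    break
                offset += limit
        return documents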
services/data-processor/src/loaders/readers/github_issues.py (file path not shown in this view; inferred from the GitHubRepositoryIssuesReader import above)
@@ -183,6 +183,7 @@ async def load_data(
extra_info = {
"state": issue["state"],
"created_at": issue["created_at"],
"updated_at": issue["updated_at"],
# url is the API URL
"url": issue["url"],
# source is the HTML URL, more convenient for humans
services/data-processor/src/loaders/readers/github_repo.py (file path not shown in this view; inferred from the GithubRepositoryReader import above)
@@ -446,6 +446,10 @@ async def _recurse_tree(
)
return blobs_and_full_paths

async def _get_latest_commit(self, path) -> str:
commits = await self._github_client.get_commits(self._owner, self._repo, path)
return commits[0]

async def _generate_documents(
self,
blobs_and_paths: List[Tuple[GitTreeResponseModel.GitTreeObject, str]],
@@ -472,6 +476,7 @@ async def _generate_documents(
documents = []
async for blob_data, full_path in buffered_iterator:
print_if_verbose(self._verbose, f"generating document for {full_path}")
latest_commit = await self._get_latest_commit(full_path)
assert (
blob_data.encoding == "base64"
), f"blob encoding {blob_data.encoding} not supported"
@@ -525,6 +530,7 @@
"file_path": full_path,
"file_name": full_path.split("/")[-1],
"url": url,
"updated_at": latest_commit.commit.author.date,
},
)
documents.append(document)
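The new _get_latest_commit depends on a get_commits method of the custom GithubClient from loaders/utils/github_client.py, which is not shown in this diff; the first commit returned for a path supplies commit.author.date, stored as the document's updated_at. A hypothetical sketch of the underlying REST call (the real client evidently wraps the response in objects, so fields are read as attributes rather than dict keys):

import httpx

async def get_commits(owner: str, repo: str, path: str, token: str) -> list:
    # GitHub lists commits newest-first, so the first element is the most
    # recent commit touching `path`.
    async with httpx.AsyncClient() as client:
        response = await client.get(
            f"https://api.github.com/repos/{owner}/{repo}/commits",
            headers={
                "Authorization": f"Bearer {token}",
                "Accept": "application/vnd.github+json",
            },
            params={"path": path, "per_page": 1},
        )
        response.raise_for_status()
        return response.json()

# usage: commits = await get_commits("owner", "repo", "src/app.py", token)
#        updated_at = commits[0]["commit"]["author"]["date"]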
117 changes: 117 additions & 0 deletions services/data-processor/src/loaders/readers/jira.py
@@ -0,0 +1,117 @@
from typing import List, Optional, TypedDict

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document


class BasicAuth(TypedDict):
email: str
api_token: str
server_url: str


class Oauth2(TypedDict):
cloud_id: str
api_token: str


class JiraReader(BaseReader):
"""Jira reader. Reads data from Jira issues from passed query.
Args:
Optional BasicAuth: {
"email": "email",
"api_token": "token",
"server_url": "server_url"
}
Optional Oauth2: {
"cloud_id": "cloud_id",
"api_token": "token"
}
"""

def __init__(
self,
email: Optional[str] = None,
api_token: Optional[str] = None,
server_url: Optional[str] = None,
BasicAuth: Optional[BasicAuth] = None,
Oauth2: Optional[Oauth2] = None,
) -> None:
from jira import JIRA

if email and api_token and server_url:
if BasicAuth is None:
BasicAuth = {}
BasicAuth["email"] = email
BasicAuth["api_token"] = api_token
BasicAuth["server_url"] = server_url

if Oauth2:
options = {
"server": f"https://api.atlassian.com/ex/jira/{Oauth2['cloud_id']}",
"headers": {"Authorization": f"Bearer {Oauth2['api_token']}"},
}
self.jira = JIRA(options=options)
else:
self.jira = JIRA(
basic_auth=(BasicAuth["email"], BasicAuth["api_token"]),
server=f"https://{BasicAuth['server_url']}",
)

def load_data(self, query: str) -> List[Document]:
relevant_issues = self.jira.search_issues(query)

issues = []

assignee = ""
reporter = ""
epic_key = ""
epic_summary = ""
epic_description = ""

for issue in relevant_issues:
# Iterates through only issues and not epics
if "parent" in (issue.raw["fields"]):
if issue.fields.assignee:
assignee = issue.fields.assignee.displayName

if issue.fields.reporter:
reporter = issue.fields.reporter.displayName

if issue.raw["fields"]["parent"]["key"]:
epic_key = issue.raw["fields"]["parent"]["key"]

if issue.raw["fields"]["parent"]["fields"]["summary"]:
epic_summary = issue.raw["fields"]["parent"]["fields"]["summary"]

if issue.raw["fields"]["parent"]["fields"]["status"]["description"]:
epic_description = issue.raw["fields"]["parent"]["fields"]["status"][
"description"
]

issues.append(
Document(
text=f"{issue.fields.summary} \n {issue.fields.description}",
doc_id=issue.id,
extra_info={
"id": issue.id,
"title": issue.fields.summary,
"url": issue.permalink(),
"created_at": issue.fields.created,
"updated_at": issue.fields.updated,
"labels": issue.fields.labels,
"status": issue.fields.status.name,
"assignee": assignee,
"reporter": reporter,
"project": issue.fields.project.name,
"issue_type": issue.fields.issuetype.name,
"priority": issue.fields.priority.name,
"epic_key": epic_key,
"epic_summary": epic_summary,
"epic_description": epic_descripton,
},
)
)

return issues
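For reference, usage of the new reader matching the call sites in loaders/jira.py above; all values are placeholders:

# OAuth integration (cloud_id + access token):
reader = JiraReader(
    Oauth2={"cloud_id": "your-cloud-id", "api_token": "your-access-token"}
)

# Basic-auth integration (email + API token + site URL without scheme):
# reader = JiraReader(
#     BasicAuth={
#         "email": "you@example.com",
#         "api_token": "your-api-token",
#         "server_url": "your-domain.atlassian.net",
#     }
# )

documents = reader.load_data("issuetype is not EMPTY")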