Skip to content

Commit

Permalink
POC: Deep Search PDF to MD file conversion
Browse files Browse the repository at this point in the history
Signed-off-by: Brent Salisbury <bsalisbu@redhat.com>
  • Loading branch information
nerdalert committed Jul 11, 2024
1 parent 5b001fd commit 6ee74c7
Show file tree
Hide file tree
Showing 15 changed files with 638 additions and 99 deletions.
6 changes: 6 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,9 @@ IL_GRANITE_API=<GRANITE_HOST>
IL_GRANITE_MODEL_NAME=<GRANITE_MODEL_NAME>
IL_MERLINITE_API=<MERLINITE_HOST>
IL_MERLINITE_MODEL_NAME=<MERLINITE_MODEL_NAME>

DS_USERNAME=<DEEP_SEARCH_USER>
DS_API_KEY=<DEEP_SEARCH_API_KEY>
DS_HOST=<DEEP_SEARCH_HOST>
DS_PROJ_KEY=<DEEP_PROJECT_KEY>
DS_PROJ_NAME=<DEEP_PROJ_NAME>
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ npm-debug.log
.env
*.env
coverage
lib
taxonomy
config.yaml
generated
Expand Down
3 changes: 2 additions & 1 deletion src/app/api/auth/[...nextauth]/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ const logger = winston.createLogger({
transports: [new winston.transports.Console(), new winston.transports.File({ filename: path.join(process.cwd(), 'auth.log') })]
});

const ORG = process.env.NEXT_PUBLIC_TAXONOMY_REPO_OWNER!;
// const ORG = process.env.NEXT_PUBLIC_TAXONOMY_REPO_OWNER!;
const ORG = 'instructlab-public';

const authOptions: NextAuthOptions = {
providers: [
Expand Down
125 changes: 125 additions & 0 deletions src/app/api/conversion/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
'use server';

import { NextResponse, NextRequest } from 'next/server';
import fetch from 'node-fetch';

/**
 * Converts a PDF stored in a GitHub repository to Markdown/JSON using the Deep Search service.
 *
 * Flow: validate the request body and environment, authenticate against Deep Search to get an
 * access token, submit the raw GitHub URL of the first `.pdf` entry in `documentNames` for
 * conversion, then poll the conversion task until it reaches a terminal state.
 *
 * @param req - Request whose JSON body carries `repoUrl` (a `https://github.com/<owner>/<repo>` URL)
 *   and `documentNames` (an array of file paths inside that repository).
 * @returns A JSON response with `json_file_url`, `md_file_url` and `document_hash` on success, or
 *   `{ error }` with status 400 (bad input), 504 (polling gave up), 500 (configuration/conversion
 *   failure) or the upstream status when a Deep Search call is rejected.
 */
export async function POST(req: NextRequest) {
  type ConvertTaskStatus = {
    task_status?: string;
    result?: { json_file_url?: string; md_file_url?: string; document_hash?: string };
  };

  const USERNAME = process.env.DS_USERNAME;
  const API_KEY = process.env.DS_API_KEY;
  const HOST = process.env.DS_HOST;
  const PROJ_KEY = process.env.DS_PROJ_KEY;
  const BRANCH = 'main';
  // Bound the polling so a task that never reaches a terminal state cannot hold the request open forever.
  const MAX_POLL_ATTEMPTS = 60;
  const POLL_INTERVAL_MS = 10000;

  // Parse the body defensively: a malformed payload is a client error, not an unhandled exception.
  let payload: unknown;
  try {
    payload = await req.json();
  } catch {
    console.error('Request body is not valid JSON');
    return NextResponse.json({ error: 'Request body must be valid JSON' }, { status: 400 });
  }

  if (!USERNAME || !API_KEY || !HOST || !PROJ_KEY) {
    console.error('Missing environment variables');
    return NextResponse.json({ error: 'Missing environment variables' }, { status: 500 });
  }

  const { repoUrl, documentNames } = (payload ?? {}) as { repoUrl?: unknown; documentNames?: unknown };
  if (typeof repoUrl !== 'string' || !Array.isArray(documentNames)) {
    console.error('Invalid request: repoUrl and documentNames are required');
    return NextResponse.json({ error: 'repoUrl (string) and documentNames (array) are required' }, { status: 400 });
  }

  const pdfFileName = documentNames.find((name): name is string => typeof name === 'string' && name.endsWith('.pdf'));
  if (!pdfFileName) {
    console.error('No PDF file found for conversion');
    return NextResponse.json({ error: 'No PDF file found for conversion' }, { status: 400 });
  }

  const [repoOwner, repoName] = repoUrl.replace('https://github.com/', '').split('/');
  if (!repoOwner || !repoName) {
    console.error(`Could not derive repository owner/name from: ${repoUrl}`);
    return NextResponse.json({ error: 'repoUrl must look like https://github.com/<owner>/<repo>' }, { status: 400 });
  }

  // Encode each path segment so names containing spaces or reserved characters still form a valid URL.
  const encodedPdfPath = pdfFileName.split('/').map(encodeURIComponent).join('/');
  const PDF_URL = `https://raw.githubusercontent.com/${repoOwner}/${repoName}/${BRANCH}/${encodedPdfPath}`;
  console.log(`PDF URL for conversion: ${PDF_URL}`);

  try {
    console.log('Starting authentication...');
    const authResponse = await fetch(`${HOST}/api/cps/user/v1/user/token`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Basic ${Buffer.from(`${USERNAME}:${API_KEY}`).toString('base64')}`
      },
      body: JSON.stringify({})
    });

    if (!authResponse.ok) {
      const error = await authResponse.text();
      console.error('Error during authentication:', error);
      return NextResponse.json({ error }, { status: authResponse.status });
    }

    const authData = (await authResponse.json()) as { access_token?: string } | null;
    const token = authData?.access_token;
    if (!token) {
      console.error('Authentication response did not contain an access token');
      return NextResponse.json({ error: 'Authentication response did not contain an access token' }, { status: 500 });
    }
    console.log('Authentication successful. Token obtained.');

    console.log('Starting PDF conversion...');
    const convertResponse = await fetch(`${HOST}/api/cps/public/v2/project/${PROJ_KEY}/convert`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: token
      },
      body: JSON.stringify({
        http_source: { url: PDF_URL, headers: {} }
      })
    });

    if (!convertResponse.ok) {
      const error = await convertResponse.text();
      console.error('Error during PDF conversion:', error);
      return NextResponse.json({ error }, { status: convertResponse.status });
    }

    const convertData = (await convertResponse.json()) as { task_id?: string } | null;
    const taskId = convertData?.task_id;
    if (!taskId) {
      console.error('Conversion response did not contain a task id');
      return NextResponse.json({ error: 'Conversion response did not contain a task id' }, { status: 500 });
    }
    console.log(`PDF conversion started. Task ID: ${taskId}`);

    console.log('Checking conversion task status...');
    let taskStatus: ConvertTaskStatus | null = null;
    let finished = false;
    for (let attempt = 1; attempt <= MAX_POLL_ATTEMPTS; attempt++) {
      const taskResponse = await fetch(`${HOST}/api/cps/public/v2/project/${PROJ_KEY}/convert_tasks/${taskId}?wait=10`, {
        method: 'GET',
        headers: {
          Authorization: token
        }
      });

      if (!taskResponse.ok) {
        const error = await taskResponse.text();
        console.error('Error during task status check:', error);
        return NextResponse.json({ error }, { status: taskResponse.status });
      }

      const taskText = await taskResponse.text();
      try {
        taskStatus = JSON.parse(taskText) as ConvertTaskStatus | null;
      } catch (parseError) {
        console.error('Error parsing task status response:', taskText);
        return NextResponse.json({ error: 'Failed to parse task status response' }, { status: 500 });
      }

      console.log(`Task status: ${taskStatus?.task_status}`);

      // FAILURE is terminal even when no result payload accompanies it; SUCCESS is only usable
      // once the result (file URLs) is present.
      if (taskStatus?.task_status === 'FAILURE' || (taskStatus?.task_status === 'SUCCESS' && taskStatus.result)) {
        finished = true;
        break;
      }

      if (attempt < MAX_POLL_ATTEMPTS) {
        await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS)); // Wait before polling again
      }
    }

    if (!finished || !taskStatus) {
      console.error(`PDF conversion task ${taskId} did not finish after ${MAX_POLL_ATTEMPTS} status checks`);
      return NextResponse.json({ error: 'Timed out waiting for PDF conversion to finish' }, { status: 504 });
    }

    if (taskStatus.task_status === 'FAILURE') {
      console.error('PDF Conversion Task failed.');
      return NextResponse.json({ error: 'PDF Conversion Task failed' }, { status: 500 });
    }

    const result = {
      json_file_url: taskStatus.result?.json_file_url,
      md_file_url: taskStatus.result?.md_file_url,
      document_hash: taskStatus.result?.document_hash
    };

    console.log('Task completed successfully.');
    console.log(`JSON file URL: ${result.json_file_url}`);
    console.log(`Markdown file URL: ${result.md_file_url}`);
    console.log(`Document hash: ${result.document_hash}`);

    return NextResponse.json(result);
  } catch (error: unknown) {
    console.error('Unexpected error:', error);
    // A thrown value is not guaranteed to be an Error, so narrow before reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    return NextResponse.json({ error: message }, { status: 500 });
  }
}
2 changes: 1 addition & 1 deletion src/app/api/pr/knowledge/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ const BASE_BRANCH = 'main';

export async function POST(req: NextRequest) {
const token = await getToken({ req, secret: process.env.NEXTAUTH_SECRET! });
console.log('GitHub Token:', token);
// console.log('GitHub Token:', token);

if (!token || !token.accessToken) {
console.error('Unauthorized: Missing or invalid access token');
Expand Down
2 changes: 1 addition & 1 deletion src/app/api/pr/skill/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ const BASE_BRANCH = 'main';

export async function POST(req: NextRequest) {
const token = await getToken({ req, secret: process.env.NEXTAUTH_SECRET! });
console.log('GitHub Token:', token);
// console.log('GitHub Token:', token);

if (!token || !token.accessToken) {
console.error('Unauthorized: Missing or invalid access token');
Expand Down
19 changes: 7 additions & 12 deletions src/app/api/upload/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ import { getToken } from 'next-auth/jwt';
import { NextRequest } from 'next/server';

const GITHUB_API_URL = 'https://api.github.com';
const TAXONOMY_DOCUMENTS_REPO = process.env.TAXONOMY_DOCUMENTS_REPO!;
const TAXONOMY_DOCUMENTS_REPO = process.env.NEXT_PUBLIC_TAXONOMY_DOCUMENTS_REPO!;
const BASE_BRANCH = 'main';

export async function POST(req: NextRequest) {
const token = await getToken({ req, secret: process.env.NEXTAUTH_SECRET! });
console.log('GitHub Token:', token);
// console.log('GitHub Token:', token);

if (!token || !token.accessToken) {
console.error('Unauthorized: Missing or invalid access token');
Expand Down Expand Up @@ -46,9 +46,6 @@ export async function POST(req: NextRequest) {
if (!repoForked) {
// Fork the repository if it is not already forked
await forkRepo(headers, repoOwner, repoName, githubUsername);
// Add a delay to ensure the fork operation completes to avoid a race condition when retrieving the base SHA
// This only occurs if this is the first time submitting and the fork isn't present.
// TODO change to a retry
console.log('Pause 5s for the forking operation to complete');
await new Promise((resolve) => setTimeout(resolve, 5000));
console.log('Repository forked');
Expand All @@ -64,7 +61,8 @@ export async function POST(req: NextRequest) {
const [name, extension] = file.fileName.split(/\.(?=[^.]+$)/);
return {
fileName: `${name}-${timestamp}.${extension}`,
fileContent: file.fileContent
fileContent: file.fileContent,
encoding: extension === 'pdf' ? 'base64' : 'utf-8'
};
});

Expand Down Expand Up @@ -160,7 +158,7 @@ async function createFilesCommit(
owner: string,
repo: string,
branchName: string,
files: { fileName: string; fileContent: string }[],
files: { fileName: string; fileContent: string; encoding: string }[],
userEmail: string,
baseSha: string
): Promise<string> {
Expand All @@ -173,7 +171,7 @@ async function createFilesCommit(
headers,
body: JSON.stringify({
content: file.fileContent,
encoding: 'utf-8'
encoding: file.encoding
})
}).then((response) => response.json())
)
Expand Down Expand Up @@ -202,12 +200,9 @@ async function createFilesCommit(
}

const treeData = await createTreeResponse.json();
console.log('Tree created:', treeData);
// console.log('Tree created:', treeData);

// Create commit with DCO sign-off
// TODO: if the user's github does not have an associated github email, we need to specify one in the upload section
// or reuse the one from the form. If we use the email field from the form, it needs to be null checked when
// the user clicks the upload documents button.
const createCommitResponse = await fetch(`${GITHUB_API_URL}/repos/${owner}/${repo}/git/commits`, {
method: 'POST',
headers,
Expand Down
2 changes: 1 addition & 1 deletion src/app/edit-submission/knowledge/[id]/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ Creator names: ${updatedAttributionData.creator_names}
className={useFileUpload ? 'button-active' : 'button-secondary'}
onClick={() => setUseFileUpload(true)}
>
Automatically Upload Documents
Upload Documents
</Button>
</div>
</FormGroup>
Expand Down
2 changes: 1 addition & 1 deletion src/components/AppLayout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ const AppLayout: React.FunctionComponent<IAppLayout> = ({ children }) => {
label: 'Contribute',
children: [
{ path: '/contribute/skill', label: 'Skill' },
{ path: '/contribute/knowledge', label: 'Knowledge' }
{ path: '/contribute/knowledge', label: 'Knowledge' },
]
},
{
Expand Down
100 changes: 100 additions & 0 deletions src/components/Contribute/Knowledge/FileSelectionModal.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// src/components/Contribute/Knowledge/FileSelectionModal.tsx
import React, { useEffect, useState } from 'react';
import { Modal, Button, DataList, DataListItem, DataListItemRow, DataListCell, DataListCheck, Spinner, Alert } from '@patternfly/react-core';
import { fetchGitHubRepoFiles } from '@/utils/fileManagerGithub';
import { useSession } from 'next-auth/react';
import { getGitHubUsername } from '@/utils/github';

interface FileSelectionModalProps {
isOpen: boolean;
onClose: () => void;
onSelectFiles: (files: string[]) => void;
repoName: string;
}

/**
 * Modal that lists the files returned by `fetchGitHubRepoFiles` for the signed-in user and lets
 * them tick the ones to use.
 *
 * @param isOpen - Whether the modal is shown.
 * @param onClose - Called when the modal is dismissed (cancel, close button, or after confirming).
 * @param onSelectFiles - Called with the selected file paths when the user confirms.
 *
 * The `repoName` prop declared on the props interface is accepted but not read by this component.
 */
export const FileSelectionModal: React.FC<FileSelectionModalProps> = ({ isOpen, onClose, onSelectFiles }) => {
  const { data: session, status } = useSession(); // Session data and resolution status from NextAuth
  const [files, setFiles] = useState<any[]>([]); // Files listed from the repository
  const [loading, setLoading] = useState<boolean>(true); // True until the file list (or an error) is available
  const [error, setError] = useState<string | null>(null);
  const [selectedFiles, setSelectedFiles] = useState<string[]>([]); // Paths the user has ticked
  // NOTE(review): the username is fetched and stored but nothing in this component reads it yet.
  const [githubUsername, setGithubUsername] = useState<string | null>(null);

  useEffect(() => {
    // While the session is still being resolved there is no token yet; keep showing the spinner
    // instead of reporting a spurious "Unauthorized" error.
    if (status === 'loading') {
      return;
    }

    // Set by the cleanup below so a superseded or unmounted load does not write stale state.
    let cancelled = false;

    const loadFiles = async () => {
      if (!session || !session.accessToken) {
        setError('Unauthorized: Missing or invalid access token');
        setLoading(false);
        return;
      }

      // Clear any error left over from an earlier run (e.g. before the session was available).
      setError(null);
      setLoading(true);

      try {
        const username = await getGitHubUsername(session.accessToken as string);
        if (cancelled) {
          return;
        }
        setGithubUsername(username);
        const repoFiles = await fetchGitHubRepoFiles(session.accessToken as string);
        if (cancelled) {
          return;
        }
        setFiles(repoFiles);
      } catch (err) {
        if (!cancelled) {
          setError('Failed to load files');
        }
        console.error(err);
      } finally {
        if (!cancelled) {
          setLoading(false);
        }
      }
    };

    void loadFiles();

    return () => {
      cancelled = true;
    };
  }, [session, status]);

  // Toggle membership based on the current selection rather than on the callback arguments:
  // the order of (event, checked) passed to onChange differs between PatternFly major versions,
  // and toggling also guarantees a path is never stored twice.
  const handleToggleFile = (filePath: string) => {
    setSelectedFiles((previous) => (previous.includes(filePath) ? previous.filter((path) => path !== filePath) : [...previous, filePath]));
  };

  // Hand the selection to the parent component, then close the modal
  const handleConfirmSelection = () => {
    onSelectFiles(selectedFiles);
    onClose();
  };

  return (
    <Modal
      title="Select PDF or Markdown files from your knowledge files repository on GitHub"
      isOpen={isOpen}
      onClose={onClose}
      actions={[
        <Button key="confirm" variant="primary" onClick={handleConfirmSelection}>
          Confirm
        </Button>,
        <Button key="cancel" variant="link" onClick={onClose}>
          Cancel
        </Button>
      ]}
    >
      {loading && <Spinner size="lg" />}
      {error && (
        <Alert variant="danger" title="Error loading files">
          {error}
        </Alert>
      )}
      {!loading && !error && (
        <DataList aria-label="File List">
          {files.map((file, index) => (
            <DataListItem key={file.path} aria-labelledby={`file-item-${index}`}>
              <DataListItemRow>
                <DataListCheck
                  aria-labelledby={`file-item-${index}`}
                  checked={selectedFiles.includes(file.path)}
                  onChange={() => handleToggleFile(file.path)}
                />
                <DataListCell>
                  <span id={`file-item-${index}`}>{file.path}</span>
                </DataListCell>
              </DataListItemRow>
            </DataListItem>
          ))}
        </DataList>
      )}
    </Modal>
  );
};

export default FileSelectionModal;
Loading

0 comments on commit 6ee74c7

Please sign in to comment.