Skip to content

Commit

Permalink
Added self scraping with Scrape.py and got rid of SerpAPI
Browse files Browse the repository at this point in the history
We got rid of SerpAPI and added a self-sustaining way to scrape with Scrape.py
  • Loading branch information
tf7software committed Sep 28, 2024
1 parent bdfa06f commit bed1406
Show file tree
Hide file tree
Showing 8 changed files with 192 additions and 141 deletions.
196 changes: 55 additions & 141 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,26 @@ const { GoogleGenerativeAI } = require('@google/generative-ai');
const axios = require('axios');
const rateLimit = require('express-rate-limit');
const validator = require('validator');
const { exec } = require('child_process'); // Import child_process to execute Python scripts
require('dotenv').config();

const app = express();
const PORT = 80;

// Run setup script to ensure Python and libraries are installed.
// The script is started asynchronously; nothing in this block waits for it to finish.
exec('bash setup.sh', (error, stdout, stderr) => {
  // A non-null error means the script could not be run or exited with a non-zero status.
  if (error) {
    console.error(`Error executing setup script: ${error.message}`);
    return;
  }
  // Output on stderr does not by itself mean failure (the exit status was zero here),
  // so report it and still print stdout instead of discarding it.
  if (stderr) {
    console.error(`stderr: ${stderr}`);
  }
  console.log(`stdout: ${stdout}`);
});


// Initialize Google Generative AI
const genAI = new GoogleGenerativeAI(process.env.API_KEY);
const model = genAI.getGenerativeModel({ model: "gemini-1.5-flash" });
Expand All @@ -34,9 +49,7 @@ app.get('/view', (req, res) => {
res.sendFile(path.join(__dirname, 'views/view.html'));
});



// Serve homepage
// Serve snake game
app.get('/snake', (_request, response) => {
  // Respond with the static snake game page stored under views/.
  const snakePagePath = path.join(__dirname, 'views/snake.html');
  response.sendFile(snakePagePath);
});
Expand Down Expand Up @@ -68,70 +81,28 @@ const deleteArticlesFolder = () => {
// Schedule the deleteArticlesFolder function to run every 24 hours
setInterval(deleteArticlesFolder, 24 * 60 * 60 * 1000); // 24 hours in milliseconds

/**
 * Flatten scraped text onto a single line.
 *
 * Every newline or carriage-return character becomes one space, then
 * leading and trailing whitespace is stripped.
 *
 * @param {string} text - Raw scraped text.
 * @returns {string} The flattened, trimmed text.
 */
const sanitizeScrapedData = (text) => {
  const flattened = text.replace(/[\n\r]/g, ' ');
  return flattened.trim();
};

/**
 * Fetch organic search result links for a query from SerpAPI.
 *
 * Results are kept in `searchCache` for 24 hours.
 *
 * @param {string} query - The user's search query.
 * @returns {Promise<string[]>} Links beginning with "http"; an empty array when the
 *   response has no `organic_results` array or the request fails.
 */
const scrapeSerpApiSearch = async (query) => {
  // scrapeSerpApiImages stores image objects in the same searchCache keyed by the raw
  // query, so a namespaced key is used here to keep the two result types apart.
  const cacheKey = `search:${query}`;

  if (searchCache.has(cacheKey)) {
    console.log("Serving from cache");
    return searchCache.get(cacheKey);
  }

  const apiKey = process.env.SERPAPI_API_KEY;
  const formattedQuery = encodeURIComponent(query);
  const url = `https://serpapi.com/search.json?q=${formattedQuery}&api_key=${apiKey}`;

  try {
    const { data } = await axios.get(url);

    if (!data.organic_results || !Array.isArray(data.organic_results)) {
      console.error("No organic results found in the response.");
      return [];
    }

    // Keep only entries that carry an http(s) link.
    const links = data.organic_results
      .map((result) => result.link)
      .filter((link) => link && link.startsWith('http'));
    console.log("Collected URLs:", links);

    // Cache the result for 24 hours
    searchCache.set(cacheKey, links);
    setTimeout(() => searchCache.delete(cacheKey), 24 * 60 * 60 * 1000);

    return links;
  } catch (error) {
    // Best effort: a failed lookup is reported and treated as "no results".
    console.error("Error scraping SerpAPI:", error);
    return [];
  }
};

// Function to scrape images from SerpAPI
const scrapeSerpApiImages = async (query) => {
if (searchCache.has(query)) {
console.log("Serving images from cache");
return searchCache.get(query);
}

const apiKey = process.env.SERPAPI_API_KEY;
const url = `https://serpapi.com/search.json?engine=google_images&q=${query}&api_key=${apiKey}`;

try {
const { data } = await axios.get(url);
const images = data.images_results.slice(0, 10).map(img => ({
thumbnail: img.thumbnail,
original: img.original
}));

// Cache the result for 24 hours
searchCache.set(query, images);
setTimeout(() => searchCache.delete(query), 24 * 60 * 60 * 1000);

return images;
} catch (error) {
console.error("Error scraping SerpAPI images:", error);
return [];
}
/**
 * Run scrape.py for a query and return its parsed JSON output.
 *
 * @param {string} query - The user's search query. Every character other than
 *   ASCII letters, digits and spaces is stripped before it reaches the shell.
 * @returns {Promise<*>} Resolves with whatever JSON scrape.py printed on stdout.
 *   Rejects with the exec error when the command fails, or with the parse error
 *   when stdout is not valid JSON.
 */
const scrapePySearch = (query) => {
  return new Promise((resolve, reject) => {
    const sanitizedQuery = query.replace(/[^a-zA-Z0-9 ]/g, ''); // sanitize query for shell
    exec(`python3 scrape.py "${sanitizedQuery}" 10`, (error, stdout, stderr) => { // Limit to 10 results
      if (stderr) {
        console.error(`stderr from Python script: ${stderr}`);
      }
      if (error) {
        console.error(`Error executing Python script: ${error.message}`);
        reject(error);
        // Stop here: stdout from a failed run is not parsed, which would only log
        // a misleading parse error and call reject a second time.
        return;
      }

      try {
        const results = JSON.parse(stdout);
        resolve(results);
      } catch (parseError) {
        console.error(`Error parsing Python script output: ${parseError.message}`);
        reject(parseError);
      }
    });
  });
};

// Rate limiter to prevent too many requests
Expand All @@ -155,11 +126,12 @@ app.post('/search', limiter, async (req, res) => {
}

try {
const lookupResult = await scrapeSerpApiSearch(query);
// Fetch results from scrape.py
const lookupResult = await scrapePySearch(query);
console.log("Scraped URLs:", lookupResult);

if (!Array.isArray(lookupResult) || lookupResult.length === 0) {
const errorMsg = "No results found from SerpAPI. Please try a different query.";
const errorMsg = "No results found. Please try a different query.";
const articleHtml = fs.readFileSync(path.join(__dirname, 'views/template.html'), 'utf8')
.replace(/{{title}}/g, query)
.replace(/{{content}}/g, "No content generated as there were no URLs.")
Expand All @@ -178,70 +150,33 @@ app.post('/search', limiter, async (req, res) => {
articleHtml = articleHtml.replace(/{{title}}/g, query);
articleHtml = articleHtml.replace(/{{content}}/g, markdownContent);

const urlList = lookupResult.map(url => `<li><a href="${url}" target="_blank">${url}</a></li>`).join('');
console.log("Generated URL List:", urlList);
const urlList = lookupResult.map(url => `<li><a href="${url.link}" target="_blank">${url.title}</a></li>`).join('');
articleHtml = articleHtml.replace(/{{urls}}/g, urlList);

try {
const images = await scrapeSerpApiImages(query);
const imageGallery = images.length > 0
? images.map(img => `<img src="${img.thumbnail}" alt="${query} image">`).join('')
: "No images available";

articleHtml = articleHtml.replace(/{{imageGallery}}/g, imageGallery);
// Removed image gallery code

fs.writeFileSync(filePath, articleHtml);
res.redirect(`/articles/${sanitizedQuery}`);
} catch (imageError) {
console.error("Error generating the image gallery:", imageError);
res.status(500).send("Error generating the image gallery.");
}
fs.writeFileSync(filePath, articleHtml);
res.redirect(`/articles/${sanitizedQuery}`);
} catch (error) {
console.error("Error during the search process:", error.message);
res.status(500).send("An unexpected error occurred: " + error.message);
}
});

// Serve suggestions for the autocomplete feature.
// Responds with the names (without ".html") of files in public/articles whose
// lower-cased name contains the query; hyphens in the query are treated as spaces.
app.get('/suggest', (req, res) => {
  // `q` is undefined when the parameter is absent and is not a string when it is
  // repeated (?q=a&q=b); only a plain string can be lower-cased safely.
  const rawQuery = req.query.q;
  if (typeof rawQuery !== 'string') {
    return res.status(400).send([]);
  }

  const query = rawQuery.toLowerCase().replace(/-/g, ' ');
  const articlesDir = path.join(__dirname, 'public/articles');

  fs.readdir(articlesDir, (err, files) => {
    if (err) {
      return res.status(500).send([]);
    }

    const suggestions = files
      .filter((file) => {
        const filename = file.replace('.html', '').toLowerCase();
        return filename.includes(query);
      })
      .map((file) => file.replace('.html', ''));

    res.send(suggestions);
  });
});

// Serve the generated article pages or create them if they don't exist
// Serve the generated article pages or create them if they don't exist
app.get('/articles/:article', async (req, res) => {
const article = req.params.article;
const filePath = path.join(__dirname, 'public/articles', `${article}.html`);

// Check if the file exists
if (fs.existsSync(filePath)) {
return res.sendFile(filePath);
}

try {
// Convert the article name back to a readable format
const query = article.replace(/-/g, ' ');
const query = article.replace(/-/g, ' ');

// Scrape information from SerpAPI
const lookupResult = await scrapeSerpApiSearch(query);
const lookupResult = await scrapePySearch(query);

// Check if any results were found
if (!Array.isArray(lookupResult) || lookupResult.length === 0) {
const errorMsg = "No content found for this article.";
const articleHtml = fs.readFileSync(path.join(__dirname, 'views/template.html'), 'utf8')
Expand All @@ -252,49 +187,28 @@ app.get('/articles/:article', async (req, res) => {
return res.sendFile(filePath);
}

// Generate a prompt for the AI content generation
const prompt = `You are Infintium. You have two purposes. If the user prompt is a math problem, solve it until it is COMPLETELY simplified. If it is a question, answer it with your own knowledge. If it is an item, such as a toaster, song, or anything that is a statement, act like Wikipedia and provide as much information as possible. USER PROMPT: ${query}`;

// Generate AI content using the prompt
const prompt = `You are Infintium. You have two purposes... USER PROMPT: ${query}`;
const result = await model.generateContent(prompt);
const markdownContent = markdown.render(result.response.text());

// Load the HTML template
let articleHtml = fs.readFileSync(path.join(__dirname, 'views/template.html'), 'utf8');

// Replace placeholders with the search query and AI content
articleHtml = articleHtml.replace(/{{title}}/g, query);
articleHtml = articleHtml.replace(/{{content}}/g, markdownContent);

// Create a list of URLs for the article
const urlList = lookupResult.map(url => `<li><a href="${url}" target="_blank">${url}</a></li>`).join('');
const urlList = lookupResult.map(url => `<li><a href="${url.link}" target="_blank">${url.title}</a></li>`).join('');
articleHtml = articleHtml.replace(/{{urls}}/g, urlList);

// Generate the image gallery in the article HTML
try {
const images = await scrapeSerpApiImages(query);

// Check if images were fetched successfully
const imageGallery = images.length > 0
? images.map(img => `<img src="${img.original}" alt="${query} image" style="width: 200px; height: auto; margin: 5px;">`).join('')
: '<p>No images available</p>';

articleHtml = articleHtml.replace(/{{imageGallery}}/g, imageGallery);
// Removed image gallery code

// Save the generated HTML file
fs.writeFileSync(filePath, articleHtml);
res.sendFile(filePath);
} catch (imageError) {
console.error("Error generating the image gallery:", imageError);
res.status(500).send("Error generating the image gallery.");
}
fs.writeFileSync(filePath, articleHtml);
res.sendFile(filePath);
} catch (error) {
console.error("Error generating the article:", error);
console.error("Error during the article generation process:", error.message);
res.status(500).send("An unexpected error occurred: " + error.message);
}
});


// Begin listening on PORT and log once the server is ready.
app.listen(PORT, function onListening() {
  console.log(`Server is running on port ${PORT}`);
  console.log(`Server is running on http://localhost:${PORT}`);
});
6 changes: 6 additions & 0 deletions node_modules/.package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions node_modules/child_process/README.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions node_modules/child_process/package.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"@google/generative-ai": "^0.19.0",
"axios": "^1.7.7",
"cheerio": "^1.0.0",
"child_process": "^1.0.2",
"dotenv": "^16.4.5",
"express": "^4.21.0",
"express-rate-limit": "^7.4.0",
Expand Down
Loading

0 comments on commit bed1406

Please sign in to comment.