Skip to content

Commit

Permalink
add more scrappers to yml
Browse files Browse the repository at this point in the history
  • Loading branch information
Victor Fernandes committed Jan 25, 2024
1 parent 6043e2f commit 00e49d8
Show file tree
Hide file tree
Showing 9 changed files with 159 additions and 132 deletions.
141 changes: 9 additions & 132 deletions src/background.ts
Original file line number Diff line number Diff line change
Expand Up @@ -164,140 +164,19 @@ function getScrapperOptionsByUrl(url: string, title: string): ScrapperOptions |
url.includes('https://my.pitchbook.com/search-results') &&
(url.includes('deals') || url.includes('companies') || url.includes('investors'))
) {
options = {
parseTables: {
header: title,
tables: [
{
rows: '#search-results-data-table-left .data-table__row',
cols: '.data-table__cell',
},
{
rows: '#search-results-data-table-right .data-table__row, #search-results-data-table-right .data-table__headers',
cols: '.data-table__cell',
},
],
mergeTablesBy: 'column',
},
};
options = scrapperOptions.pitchbook;
} else if (url.includes('finance.yahoo.com/quote/') && url.includes('financials')) {
options = {
parseTables: {
header: title,
tables: [
{ rows: '[class*="(tbhg)"]>[class*="(tbr)"]', cols: 'div > span' },
{ rows: '[class*="(tbr)"]', cols: '[data-test="fin-col"], [title]' },
],
mergeTablesBy: 'row',
},
};
options = scrapperOptions.yahooFinance;
} else if (url.includes('www.netflix.com/browse')) {
options = {
header: 'Netflix browse results',
listElementsQuery: '.title-card',
elementParser: [
{ title: 'Cover', query: 'img', type: 'image' },
{ title: 'Title', query: '.fallback-text', type: 'text' },
{ title: 'Link', query: 'a', type: 'clean-url' },
],
};
options = scrapperOptions.netflix;
} else if (url.includes('yellowpages.com/search')) {
options = {
header: title,
listElementsQuery: '.result',
elementParser: [
{ title: 'Logo', query: 'img', type: 'image' },
{ title: 'Name', query: '.business-name', type: 'text' },
{ title: 'Phone number', query: '.phone', type: 'text' },
{ title: 'Address', query: '.adr', type: 'text' },
{ title: 'Categories', query: '.categories', type: 'text' },
{ title: 'Website', query: '.track-visit-website', type: 'link' },
],
};
} else if (url.includes('yelp.com/search')) {
options = {
header: title,
listElementsQuery: '[data-testid="serp-ia-card"]',
elementParser: [
{ title: 'Image', query: 'img', type: 'image' },
{ title: 'Name', query: '[class*="businessName_"]', type: 'text' },
{
title: 'Rating',
query: 'span[data-font-weight="semibold"]',
type: 'text',
},
{
title: 'Categories',
query: '[class*="priceCategory"]',
type: 'text',
},
{
title: 'Yelp link',
query: '[class*="businessName_"] a',
type: 'clean-url',
},
],
};
options = scrapperOptions.yellowPages;
} else if (url.includes('yelp.') && url.includes('/search')) {
options = scrapperOptions.yelp;
} else if (url.includes('zillow.com') && (url.includes('/for_') || url.includes('?search'))) {
options = {
header: title,
listElementsQuery: '[data-test="property-card"]',
elementParser: [
{ title: 'Image', query: 'img', type: 'image' },
{
title: 'Address',
query: '[data-test="property-card-addr"]',
type: 'text',
},
{
title: 'Price',
query: '[data-test="property-card-price"]',
type: 'text',
},
{
title: 'Bedrooms',
query: 'ul[class*="StyledPropertyCardHomeDetailsList"] li:nth-child(1)',
type: 'text',
},
{
title: 'Bathrooms',
query: 'ul[class*="StyledPropertyCardHomeDetailsList"] li:nth-child(2)',
type: 'text',
},
{
title: 'Area',
query: 'ul[class*="StyledPropertyCardHomeDetailsList"] li:nth-child(3)',
type: 'text',
},
{
title: 'Zillow link',
query: '[data-test="property-card-link"]',
type: 'link',
},
],
};
options = scrapperOptions.zillow;
} else if (url.includes('ebay.com/sch/')) {
options = {
header: title,
listElementsQuery: 'ul > [id*="item"]',
elementParser: [
{ title: 'Image', query: 'img', type: 'image' },
{ title: 'Name', query: '.s-item__title', type: 'text' },
{ title: 'Price', query: '.s-item__price', type: 'text' },
{ title: 'State', query: '.s-item__subtitle', type: 'text' },
{ title: 'From', query: '.s-item__itemLocation', type: 'text' },
{
title: 'Seller info',
query: '.s-item__seller-info-text',
type: 'text',
},
{
title: 'Product link',
query: '.s-item__info > a',
type: 'clean-url',
},
],
};
options = scrapperOptions.ebay;
} else if (url.includes('google.com/maps/search')) {
options = scrapperOptions.googleMapsSearchOptions;
} else if (url.includes('kuantokusta.')) {
Expand Down Expand Up @@ -327,9 +206,7 @@ async function scrap() {
const tab = await getCurrentTab();
const options = getScrapperOptionsByUrl(tab.url!, tab.title!);

const elements = await runScrapper(options);

return elements;
return await runScrapper(options);
}

async function storeRowsXData(tsv: string, tabId: number) {
Expand Down
29 changes: 29 additions & 0 deletions src/scrappers/ebay.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Scrapper options for eBay search result pages.
# background.ts selects this config when the tab URL contains `ebay.com/sch/`.
# NOTE(review): the inline options this file replaces also set `header` to the
# tab title; no `header` is defined here — confirm the caller supplies one.

# CSS selector for the elements that make up the result list.
listElementsQuery: 'ul > [id*="item"]'

# One entry per extracted field: `title` labels it, `query` is a CSS selector,
# `type` names the kind of value (presumably how the scrapper reads the node).
elementParser:
  - title: Image
    query: img
    type: image

  - title: Name
    query: .s-item__title
    type: text

  - title: Price
    query: .s-item__price
    type: text

  - title: State
    query: .s-item__subtitle
    type: text

  - title: From
    query: .s-item__itemLocation
    type: text

  - title: Seller info
    query: .s-item__seller-info-text
    type: text

  - title: Product link
    query: .s-item__info > a
    type: clean-url
14 changes: 14 additions & 0 deletions src/scrappers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ import productHuntOptions from './producthunt.yml';
import googleMapsSearchOptions from './google-maps-search.yml';
import tiktokAccounts from './tik-tok-accounts.yml';
import tiktokSearch from './tik-tok-search.yml';
import netflix from './netflix.yml';
import ebay from './ebay.yml';
import zillow from './zillow.yml';
import yelp from './yelp.yml';
import yahooFinance from './yahoo-finance.yml';
import yellowPages from './yellow-pages.yml';
import pitchbook from './pitchbook.yml';

export default {
youtubeOptions,
Expand All @@ -21,4 +28,11 @@ export default {
googleMapsSearchOptions,
tiktokAccounts,
tiktokSearch,
netflix,
ebay,
zillow,
yelp,
yahooFinance,
yellowPages,
pitchbook,
};
14 changes: 14 additions & 0 deletions src/scrappers/netflix.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Scrapper options for the Netflix browse page.
# background.ts selects this config when the tab URL contains
# `www.netflix.com/browse`.

# Fixed header for the scraped output (not derived from the tab title).
header: Netflix browse results

# CSS selector for the elements that make up the result list.
listElementsQuery: .title-card

# One entry per extracted field: `title` labels it, `query` is a CSS selector,
# `type` names the kind of value (presumably how the scrapper reads the node).
elementParser:
  - title: Cover
    query: img
    type: image

  - title: Title
    query: .fallback-text
    type: text

  - title: Link
    query: a
    type: clean-url
9 changes: 9 additions & 0 deletions src/scrappers/pitchbook.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Scrapper options for PitchBook search-result tables.
# background.ts selects this config for `my.pitchbook.com/search-results` URLs
# that also contain `deals`, `companies` or `investors`.
# NOTE(review): the inline options this file replaces also set
# `parseTables.header` to the tab title; it is not defined here — confirm the
# caller supplies one.
parseTables:
  # Each table gives a selector for its rows and one for the cells in a row.
  tables:
    - rows: '#search-results-data-table-left .data-table__row'
      cols: .data-table__cell

    - rows: '#search-results-data-table-right .data-table__row, #search-results-data-table-right .data-table__headers'
      cols: .data-table__cell

  # The left and right tables are combined by column.
  mergeTablesBy: column
9 changes: 9 additions & 0 deletions src/scrappers/yahoo-finance.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Scrapper options for Yahoo Finance financials tables.
# background.ts selects this config when the tab URL contains both
# `finance.yahoo.com/quote/` and `financials`.
# NOTE(review): the inline options this file replaces also set
# `parseTables.header` to the tab title; it is not defined here — confirm the
# caller supplies one.
parseTables:
  # Each table gives a selector for its rows and one for the cells in a row.
  tables:
    - rows: '[class*="(tbhg)"]>[class*="(tbr)"]'
      cols: div > span

    - rows: '[class*="(tbr)"]'
      cols: '[data-test="fin-col"], [title]'

  # The two tables are combined by row.
  mergeTablesBy: row
25 changes: 25 additions & 0 deletions src/scrappers/yellow-pages.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Scrapper options for Yellow Pages search results.
# background.ts selects this config when the tab URL contains
# `yellowpages.com/search`.
# NOTE(review): the inline options this file replaces also set `header` to the
# tab title; no `header` is defined here — confirm the caller supplies one.

# CSS selector for the elements that make up the result list.
listElementsQuery: .result

# One entry per extracted field: `title` labels it, `query` is a CSS selector,
# `type` names the kind of value (presumably how the scrapper reads the node).
elementParser:
  - title: Logo
    query: img
    type: image

  - title: Name
    query: .business-name
    type: text

  - title: Phone number
    query: .phone
    type: text

  - title: Address
    query: .adr
    type: text

  - title: Categories
    query: .categories
    type: text

  - title: Website
    query: .track-visit-website
    type: link
21 changes: 21 additions & 0 deletions src/scrappers/yelp.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Scrapper options for Yelp search results.
# background.ts selects this config when the tab URL contains both `yelp.` and
# `/search`.
# NOTE(review): the inline options this file replaces also set `header` to the
# tab title; no `header` is defined here — confirm the caller supplies one.

# CSS selector for the elements that make up the result list.
listElementsQuery: '[data-testid="serp-ia-card"]'

# One entry per extracted field: `title` labels it, `query` is a CSS selector,
# `type` names the kind of value (presumably how the scrapper reads the node).
elementParser:
  - title: Image
    query: img
    type: image

  - title: Name
    query: '[class*="businessName_"]'
    type: text

  - title: Rating
    query: 'span[data-font-weight="semibold"]'
    type: text

  - title: Categories
    query: '[class*="priceCategory"]'
    type: text

  - title: Yelp link
    query: '[class*="businessName_"] a'
    type: clean-url
29 changes: 29 additions & 0 deletions src/scrappers/zillow.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Scrapper options for Zillow listing search pages.
# background.ts selects this config when the tab URL contains `zillow.com` and
# either `/for_` or `?search`.
# NOTE(review): the inline options this file replaces also set `header` to the
# tab title; no `header` is defined here — confirm the caller supplies one.

# CSS selector for the elements that make up the result list.
listElementsQuery: '[data-test="property-card"]'

# One entry per extracted field: `title` labels it, `query` is a CSS selector,
# `type` names the kind of value (presumably how the scrapper reads the node).
elementParser:
  - title: Image
    query: img
    type: image

  - title: Address
    query: '[data-test="property-card-addr"]'
    type: text

  - title: Price
    query: '[data-test="property-card-price"]'
    type: text

  # Bedrooms / Bathrooms / Area are picked by position in the details list.
  - title: Bedrooms
    query: 'ul[class*="StyledPropertyCardHomeDetailsList"] li:nth-child(1)'
    type: text

  - title: Bathrooms
    query: 'ul[class*="StyledPropertyCardHomeDetailsList"] li:nth-child(2)'
    type: text

  - title: Area
    query: 'ul[class*="StyledPropertyCardHomeDetailsList"] li:nth-child(3)'
    type: text

  - title: Zillow link
    query: '[data-test="property-card-link"]'
    type: link

0 comments on commit 00e49d8

Please sign in to comment.