scraping with tabula (india_mopng_scrape.py)
cbej-ea committed Jan 30, 2024
1 parent f4c5595 commit 898ba2b
Showing 12 changed files with 236 additions and 10 deletions.
6 changes: 3 additions & 3 deletions cloudbuild.yaml
@@ -1,4 +1,4 @@
-### This is your cloud-build template. Please replace any {__name__} with the appropriate Docker tag you want to give to
+### This is your cloud-build template. Please replace any {india_mopng_etl} with the appropriate Docker tag you want to give to
### Your Docker image
steps:
# Get the github pem
@@ -14,11 +14,11 @@ steps:
entrypoint: 'sh'
args: [
'-c',
-'docker build --build-arg SSH_PRIVATE_KEY="$(cat /root/.ssh/id_github)" -f Dockerfile -t gcr.io/$PROJECT_ID/{__name__} .'
+'docker build --build-arg SSH_PRIVATE_KEY="$(cat /root/.ssh/id_github)" -f Dockerfile -t gcr.io/$PROJECT_ID/{india_mopng_etl} .'
]
volumes:
- name: 'ssh'
path: /root/.ssh

-images: ['gcr.io/$PROJECT_ID/{__name__}']
+images: ['gcr.io/$PROJECT_ID/{india_mopng_etl}']
timeout: '1200s'
93 changes: 93 additions & 0 deletions requirements.txt
@@ -0,0 +1,93 @@
attrs @ file:///C:/b/abs_35n0jusce8/croot/attrs_1695717880170/work
beautifulsoup4==4.12.3
boto3==1.34.27
botocore==1.34.27
Brotli @ file:///C:/Windows/Temp/abs_63l7912z0e/croots/recipe/brotli-split_1659616056886/work
bs4==0.0.2
cachetools==5.3.2
certifi @ file:///C:/b/abs_91u83siphd/croot/certifi_1700501720658/work/certifi
cffi @ file:///C:/b/abs_924gv1kxzj/croot/cffi_1700254355075/work
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
cryptography @ file:///C:/b/abs_e8cnom_zw_/croot/cryptography_1702071486468/work
cssselect==1.2.0
decorator==5.1.1
distro==1.9.0
Elixir==0.7.1
et-xmlfile==1.1.0
exceptiongroup @ file:///C:/b/abs_c5h1o1_b5b/croot/exceptiongroup_1706031441653/work
ghostscript==0.7
google-api-core==2.15.0
google-auth==2.27.0
google-cloud-core==2.4.1
google-cloud-storage==2.14.0
google-crc32c @ file:///C:/b/abs_f8g37ql__2/croot/google-crc32c_1667946622512/work
google-resumable-media==2.7.0
googleapis-common-protos==1.62.0
greenlet @ file:///C:/b/abs_a6c75ie0bc/croot/greenlet_1702060012174/work
h11==0.14.0
helper-functions-ea @ git+https://git@github.com/energyaspects/helper_functions.git@31531d83a80f324453173e05807145af4a583a2a
idna @ file:///C:/b/abs_bdhbebrioa/croot/idna_1666125572046/work
Jinja2==3.1.3
jmespath @ file:///C:/b/abs_59jpuaows7/croot/jmespath_1700144635019/work
JPype1==1.5.0
lxml==5.1.0
MarkupSafe==2.1.4
mkl-fft @ file:///C:/b/abs_19i1y8ykas/croot/mkl_fft_1695058226480/work
mkl-random @ file:///C:/b/abs_edwkj1_o69/croot/mkl_random_1695059866750/work
mkl-service==2.4.0
numpy @ file:///C:/b/abs_16b2j7ad8n/croot/numpy_and_numpy_base_1704311752418/work/dist/numpy-1.26.3-cp39-cp39-win_amd64.whl#sha256=02e606e23ca31bb00a40d147fd1ce4dd7d241395346a4196592d5abe54a333bc
opencv-python==4.9.0.80
openpyxl==3.1.2
outcome @ file:///tmp/build/80754af9/outcome_1609338780791/work
packaging==23.2
pandas==2.2.0
pbr==6.0.0
pdfminer.six==20231228
pdftopng==0.2.3
protobuf==4.25.2
psycopg2-binary==2.9.9
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
PyMuPDFb==1.23.9
PyMySQL==1.1.0
pyOpenSSL @ file:///C:/b/abs_08f38zyck4/croot/pyopenssl_1690225407403/work
pypdf==4.0.0
PyPDF2==3.0.1
pyquery==2.0.0
PySocks @ file:///C:/ci/pysocks_1605307512533/work
python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
python-dotenv==1.0.1
python-http-client==3.3.7
pytz @ file:///C:/b/abs_19q3ljkez4/croot/pytz_1695131651401/work
requests @ file:///C:/b/abs_316c2inijk/croot/requests_1690400295842/work
roman==4.1
rsa==4.9
s3transfer==0.10.0
selenium==4.17.2
sendgrid==6.11.0
shooju==3.8.13
six @ file:///tmp/build/80754af9/six_1644875935023/work
sniffio @ file:///C:/b/abs_3akdewudo_/croot/sniffio_1705431337396/work
sortedcontainers @ file:///tmp/build/80754af9/sortedcontainers_1623949099177/work
soupsieve @ file:///C:/b/abs_bbsvy9t4pl/croot/soupsieve_1696347611357/work
SQLAlchemy==0.7.10
sqlalchemy-migrate==0.11.0
sqlparse==0.4.4
starkbank-ecdsa==2.2.0
tabula-py==2.9.0
tabulate==0.9.0
Tempita==0.5.2
trio @ file:///C:/b/abs_3bsokxbl8q/croot/trio_1705518572139/work
trio-websocket==0.11.1
typing_extensions @ file:///C:/b/abs_72cdotwc_6/croot/typing_extensions_1705599364138/work
tzdata==2023.4
urllib3 @ file:///C:/b/abs_9cmlsrm3ys/croot/urllib3_1698257595508/work
webdriver-manager==4.0.1
win-inet-pton @ file:///C:/ci/win_inet_pton_1605306162074/work
wsproto==1.2.0
xlrd==0.7.1
xlwt==0.7.2
4 changes: 2 additions & 2 deletions setup.py
@@ -28,9 +28,9 @@
# if you are using JSONs or CSVs in your code, include this to make sure the package bundles those data files when it installs
    entry_points={
        'console_scripts': [
-            "name_of_task = __name__.main:main",
+            "name_of_task = india_mopng_etl.main:main",
            # these are the commands that can be run through bash. Make sure you specify the correct folder under the
-            # src path that includes the python script that you will run. In this instance it would be __name__.main: main
+            # src path that includes the python script that you will run. In this instance it would be india_mopng_etl.main:main
        ],
    },
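
For context, a minimal sketch of how this entry point sits inside setup(); the name and version shown here are illustrative placeholders, while the src/ layout matches the paths in this commit:

from setuptools import setup, find_packages

setup(
    name="india_mopng_etl",                 # placeholder; use the real project metadata
    version="0.1.0",                        # placeholder
    package_dir={"": "src"},                # packages live under src/ in this repo
    packages=find_packages(where="src"),
    entry_points={
        "console_scripts": [
            # installs a `name_of_task` command that runs india_mopng_etl.main:main
            "name_of_task = india_mopng_etl.main:main",
        ],
    },
)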

File renamed without changes.
133 changes: 133 additions & 0 deletions src/india_mopng_etl/india_mopng_scrape.py
@@ -0,0 +1,133 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import tabula
import pandas as pd


def get_list_of_pdfs(url):
    """Return the monthly-production PDF links to process.

    The live scrape of `url` is commented out below; a single known PDF is
    returned while the table extraction is being developed.
    """
    # read = requests.get(url)
    # soup = BeautifulSoup(read.content, "html.parser")
    #
    # # created an empty list for collecting the pdf links
    # list_of_pdf = []
    #
    # # iterate through the anchor tags, checking for the .pdf extension
    # for link in soup.find_all('a'):
    #     if link.get('href').endswith('.pdf'):
    #         if not link.get('href').startswith('http'):
    #             # resolve relative links against the page URL
    #             list_of_pdf.append(urljoin(url, link.get('href')))
    #         else:
    #             list_of_pdf.append(link.get('href'))
    # return list_of_pdf
    return ['https://mopng.gov.in/files/petroleumStatistics/monthlyProduction/MPR-for-the-month-of-Feb,2021.pdf']
    # Other sample PDFs used during development:
    # 'https://mopng.gov.in/files/petroleumStatistics/monthlyProduction/mprsep2023.pdf'
    # 'https://mopng.gov.in/files/petroleumStatistics/monthlyProduction/MPRjan2023.pdf'
    # 'https://mopng.gov.in/files/petroleumStatistics/monthlyProduction/mprjuly17.pdf'
    # 'https://mopng.gov.in/files/petroleumStatistics/monthlyProduction/ilovepdf_merged.pdf'
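

# Illustrative sketch (not part of this commit): a self-contained version of
# the scrape commented out above, with a guard for anchors that have no href.
def scrape_pdf_links(url):
    """Return absolute URLs for every .pdf link found on the page."""
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    links = []
    for link in soup.find_all('a', href=True):  # href=True skips bare anchors
        href = link.get('href')
        if href.endswith('.pdf'):
            links.append(href if href.startswith('http') else urljoin(url, href))
    return links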


def get_pdf_info(pdf_path):
    """Extract the crude-oil and petroleum-products summary tables from one PDF."""
    print(pdf_path)
    tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True, lattice=True)

    target_header_for_crude = "Crude Oil Production during the month of"
    target_header_for_petroleum = "Production of Petroleum Products during the month of"
    crude_target_table = None
    petroleum_target_table = None

    # Identify the two target tables by the header text in their first cell
    for table in tables:
        if table.shape[0] == 0 or pd.isnull(table.iloc[0, 0]):
            continue
        first_cell = str(table.iloc[0, 0])
        if target_header_for_crude in first_cell:
            crude_target_table = table
        elif target_header_for_petroleum in first_cell:
            petroleum_target_table = table
        if crude_target_table is not None and petroleum_target_table is not None:
            break

    if crude_target_table is None or petroleum_target_table is None:
        print("Target headers not found in the extracted tables.")
        return

    for label, table in (("CRUDE", crude_target_table), ("PETROLEUM", petroleum_target_table)):
        # Data rows start below the column-index row "(1) (2) ..."; find the
        # first row containing "(1)" and drop it and everything above it.
        break_point = None
        for i, row in enumerate(table.values):
            if any("(1)" in str(cell) for cell in row):
                break_point = i
                break
        if break_point is not None:
            table = table.iloc[break_point + 1:, :].reset_index(drop=True)
            # Remove the trailing totals row
            table = table.iloc[:-1]

        # Rename columns by position, since tabula does not recover usable headers
        column_positions_to_rename = {0: 'Name of Undertaking/Unit/State',
                                      4: 'Production during the Preceding month of current year'}
        for position, new_name in column_positions_to_rename.items():
            if position < len(table.columns):  # skip positions beyond the table width
                table.rename(columns={table.columns[position]: new_name}, inplace=True)

        # Keep only the name column and the preceding-month column (zero-indexed 0 and 4)
        table = table.iloc[:, [0, 4]]
        print(f"*****************************{label}*************************")
        print(table)


def main():
    url = "https://mopng.gov.in/en/petroleum-statistics/monthly-production"
    list_of_pdf = get_list_of_pdfs(url)
    for pdf_url in list_of_pdf:
        get_pdf_info(pdf_url)


if __name__ == "__main__":  # pragma: no cover
    main()
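
To make the positional rename-and-slice in get_pdf_info concrete, here is a toy run on a hypothetical five-column frame of the kind tabula returns for these PDFs (column labels and data values are invented):

import pandas as pd

raw = pd.DataFrame(
    [["ONGC", "10.1", "9.8", "120.3", "9.9"]],
    columns=["Unnamed: 0", "Unnamed: 1", "Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
)
positions = {0: 'Name of Undertaking/Unit/State',
             4: 'Production during the Preceding month of current year'}
for pos, name in positions.items():
    if pos < len(raw.columns):  # skip positions beyond the table width
        raw.rename(columns={raw.columns[pos]: name}, inplace=True)
tidy = raw.iloc[:, [0, 4]]  # keep only the name and preceding-month columns
print(tidy)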
4 changes: 2 additions & 2 deletions src/__name__/main.py → src/india_mopng_etl/main.py
@@ -1,8 +1,8 @@
import pandas as pd

from helper_functions_ea import Logger
-from __name__.metadata import metadata
-from __name__.utils.base_classes import DataExtractor
+from india_mopng_etl.metadata import metadata
+from india_mopng_etl.utils.base_classes import DataExtractor


class __Class_Name__(DataExtractor): # make sure you rename the class to your preference
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -1,4 +1,4 @@
-from __name__.utils import *
+from india_mopng_etl.utils import *


class BaseVariables:
File renamed without changes.
4 changes: 2 additions & 2 deletions tests/test_main.py
@@ -1,8 +1,8 @@
import argparse
import pytest
import pandas as pd
-from __name__.main import __Class_Name__
-from __name__ import main
+from india_mopng_etl.main import __Class_Name__
+from india_mopng_etl import main

dummy_data = pd.DataFrame()
