scraping with tabula (india_mopng_scrape.py)
cbej-ea committed Jan 30, 2024
1 parent f4c5595 commit 898ba2b
Showing 12 changed files with 236 additions and 10 deletions.
6 changes: 3 additions & 3 deletions cloudbuild.yaml
@@ -1,4 +1,4 @@
-### This is your cloud-build template. Please replace any {__name__} with the appropriate Docker tag you want to give to
+### This is your cloud-build template. Please replace any {india_mopng_etl} with the appropriate Docker tag you want to give to
### Your Docker image
steps:
# Get the github pem
@@ -14,11 +14,11 @@ steps:
entrypoint: 'sh'
args: [
'-c',
-'docker build --build-arg SSH_PRIVATE_KEY="$(cat /root/.ssh/id_github)" -f Dockerfile -t gcr.io/$PROJECT_ID/{__name__} .'
+'docker build --build-arg SSH_PRIVATE_KEY="$(cat /root/.ssh/id_github)" -f Dockerfile -t gcr.io/$PROJECT_ID/{india_mopng_etl} .'
]
volumes:
- name: 'ssh'
path: /root/.ssh

-images: ['gcr.io/$PROJECT_ID/{__name__}']
+images: ['gcr.io/$PROJECT_ID/{india_mopng_etl}']
timeout: '1200s'
93 changes: 93 additions & 0 deletions requirements.txt
@@ -0,0 +1,93 @@
attrs @ file:///C:/b/abs_35n0jusce8/croot/attrs_1695717880170/work
beautifulsoup4==4.12.3
boto3==1.34.27
botocore==1.34.27
Brotli @ file:///C:/Windows/Temp/abs_63l7912z0e/croots/recipe/brotli-split_1659616056886/work
bs4==0.0.2
cachetools==5.3.2
certifi @ file:///C:/b/abs_91u83siphd/croot/certifi_1700501720658/work/certifi
cffi @ file:///C:/b/abs_924gv1kxzj/croot/cffi_1700254355075/work
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
cryptography @ file:///C:/b/abs_e8cnom_zw_/croot/cryptography_1702071486468/work
cssselect==1.2.0
decorator==5.1.1
distro==1.9.0
Elixir==0.7.1
et-xmlfile==1.1.0
exceptiongroup @ file:///C:/b/abs_c5h1o1_b5b/croot/exceptiongroup_1706031441653/work
ghostscript==0.7
google-api-core==2.15.0
google-auth==2.27.0
google-cloud-core==2.4.1
google-cloud-storage==2.14.0
google-crc32c @ file:///C:/b/abs_f8g37ql__2/croot/google-crc32c_1667946622512/work
google-resumable-media==2.7.0
googleapis-common-protos==1.62.0
greenlet @ file:///C:/b/abs_a6c75ie0bc/croot/greenlet_1702060012174/work
h11==0.14.0
helper-functions-ea @ git+https://git@github.com/energyaspects/helper_functions.git@31531d83a80f324453173e05807145af4a583a2a
idna @ file:///C:/b/abs_bdhbebrioa/croot/idna_1666125572046/work
Jinja2==3.1.3
jmespath @ file:///C:/b/abs_59jpuaows7/croot/jmespath_1700144635019/work
JPype1==1.5.0
lxml==5.1.0
MarkupSafe==2.1.4
mkl-fft @ file:///C:/b/abs_19i1y8ykas/croot/mkl_fft_1695058226480/work
mkl-random @ file:///C:/b/abs_edwkj1_o69/croot/mkl_random_1695059866750/work
mkl-service==2.4.0
numpy @ file:///C:/b/abs_16b2j7ad8n/croot/numpy_and_numpy_base_1704311752418/work/dist/numpy-1.26.3-cp39-cp39-win_amd64.whl#sha256=02e606e23ca31bb00a40d147fd1ce4dd7d241395346a4196592d5abe54a333bc
opencv-python==4.9.0.80
openpyxl==3.1.2
outcome @ file:///tmp/build/80754af9/outcome_1609338780791/work
packaging==23.2
pandas==2.2.0
pbr==6.0.0
pdfminer.six==20231228
pdftopng==0.2.3
protobuf==4.25.2
psycopg2-binary==2.9.9
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
PyMuPDFb==1.23.9
PyMySQL==1.1.0
pyOpenSSL @ file:///C:/b/abs_08f38zyck4/croot/pyopenssl_1690225407403/work
pypdf==4.0.0
PyPDF2==3.0.1
pyquery==2.0.0
PySocks @ file:///C:/ci/pysocks_1605307512533/work
python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
python-dotenv==1.0.1
python-http-client==3.3.7
pytz @ file:///C:/b/abs_19q3ljkez4/croot/pytz_1695131651401/work
requests @ file:///C:/b/abs_316c2inijk/croot/requests_1690400295842/work
roman==4.1
rsa==4.9
s3transfer==0.10.0
selenium==4.17.2
sendgrid==6.11.0
shooju==3.8.13
six @ file:///tmp/build/80754af9/six_1644875935023/work
sniffio @ file:///C:/b/abs_3akdewudo_/croot/sniffio_1705431337396/work
sortedcontainers @ file:///tmp/build/80754af9/sortedcontainers_1623949099177/work
soupsieve @ file:///C:/b/abs_bbsvy9t4pl/croot/soupsieve_1696347611357/work
SQLAlchemy==0.7.10
sqlalchemy-migrate==0.11.0
sqlparse==0.4.4
starkbank-ecdsa==2.2.0
tabula-py==2.9.0
tabulate==0.9.0
Tempita==0.5.2
trio @ file:///C:/b/abs_3bsokxbl8q/croot/trio_1705518572139/work
trio-websocket==0.11.1
typing_extensions @ file:///C:/b/abs_72cdotwc_6/croot/typing_extensions_1705599364138/work
tzdata==2023.4
urllib3 @ file:///C:/b/abs_9cmlsrm3ys/croot/urllib3_1698257595508/work
webdriver-manager==4.0.1
win-inet-pton @ file:///C:/ci/win_inet_pton_1605306162074/work
wsproto==1.2.0
xlrd==0.7.1
xlwt==0.7.2
4 changes: 2 additions & 2 deletions setup.py
@@ -28,9 +28,9 @@
# if you are using JSONs or CSVs in your code, include this to make sure the package bundles those data files when it installs
    entry_points={
        'console_scripts': [
-            "name_of_task = __name__.main:main",
+            "name_of_task = india_mopng_etl.main:main",
            # these are the commands that can be run through bash. Make sure you specify the correct folder under the
-            # src path that includes the python script that you will run. In this instance it would be __name__.main: main
+            # src path that includes the python script that you will run. In this instance it would be india_mopng_etl.main:main
        ],
    },
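
For context, a minimal sketch of how this entry point sits inside setup(); the name and version shown here are illustrative placeholders, while the src/ layout matches the paths in this commit:

from setuptools import setup, find_packages

setup(
    name="india_mopng_etl",                 # placeholder; use the real project metadata
    version="0.1.0",                        # placeholder
    package_dir={"": "src"},                # packages live under src/ in this repo
    packages=find_packages(where="src"),
    entry_points={
        "console_scripts": [
            # installs a `name_of_task` command that runs india_mopng_etl.main:main
            "name_of_task = india_mopng_etl.main:main",
        ],
    },
)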

File renamed without changes.
133 changes: 133 additions & 0 deletions src/india_mopng_etl/india_mopng_scrape.py
@@ -0,0 +1,133 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import tabula
import pandas as pd


def get_list_of_pdfs(url):
    """Return the monthly-production PDF links to process.

    The live scrape of `url` is commented out below; a single known PDF is
    returned while the table extraction is being developed.
    """
    # read = requests.get(url)
    # soup = BeautifulSoup(read.content, "html.parser")
    #
    # # created an empty list for collecting the pdf links
    # list_of_pdf = []
    #
    # # iterate through the anchor tags, checking for the .pdf extension
    # for link in soup.find_all('a'):
    #     if link.get('href').endswith('.pdf'):
    #         if not link.get('href').startswith('http'):
    #             # resolve relative links against the page URL
    #             list_of_pdf.append(urljoin(url, link.get('href')))
    #         else:
    #             list_of_pdf.append(link.get('href'))
    # return list_of_pdf
    return ['https://mopng.gov.in/files/petroleumStatistics/monthlyProduction/MPR-for-the-month-of-Feb,2021.pdf']
    # Other sample PDFs used during development:
    # 'https://mopng.gov.in/files/petroleumStatistics/monthlyProduction/mprsep2023.pdf'
    # 'https://mopng.gov.in/files/petroleumStatistics/monthlyProduction/MPRjan2023.pdf'
    # 'https://mopng.gov.in/files/petroleumStatistics/monthlyProduction/mprjuly17.pdf'
    # 'https://mopng.gov.in/files/petroleumStatistics/monthlyProduction/ilovepdf_merged.pdf'
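

# Illustrative sketch (not part of this commit): a self-contained version of
# the scrape commented out above, with a guard for anchors that have no href.
def scrape_pdf_links(url):
    """Return absolute URLs for every .pdf link found on the page."""
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    links = []
    for link in soup.find_all('a', href=True):  # href=True skips bare anchors
        href = link.get('href')
        if href.endswith('.pdf'):
            links.append(href if href.startswith('http') else urljoin(url, href))
    return links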


def get_pdf_info(pdf_path):
    """Extract the crude-oil and petroleum-products summary tables from one PDF."""
    print(pdf_path)
    tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True, lattice=True)

    target_header_for_crude = "Crude Oil Production during the month of"
    target_header_for_petroleum = "Production of Petroleum Products during the month of"
    crude_target_table = None
    petroleum_target_table = None

    # Identify the two target tables by the header text in their first cell
    for table in tables:
        if table.shape[0] == 0 or pd.isnull(table.iloc[0, 0]):
            continue
        first_cell = str(table.iloc[0, 0])
        if target_header_for_crude in first_cell:
            crude_target_table = table
        elif target_header_for_petroleum in first_cell:
            petroleum_target_table = table
        if crude_target_table is not None and petroleum_target_table is not None:
            break

    if crude_target_table is None or petroleum_target_table is None:
        print("Target headers not found in the extracted tables.")
        return

    for label, table in (("CRUDE", crude_target_table), ("PETROLEUM", petroleum_target_table)):
        # Data rows start below the column-index row "(1) (2) ..."; find the
        # first row containing "(1)" and drop it and everything above it.
        break_point = None
        for i, row in enumerate(table.values):
            if any("(1)" in str(cell) for cell in row):
                break_point = i
                break
        if break_point is not None:
            table = table.iloc[break_point + 1:, :].reset_index(drop=True)
            # Remove the trailing totals row
            table = table.iloc[:-1]

        # Rename columns by position, since tabula does not recover usable headers
        column_positions_to_rename = {0: 'Name of Undertaking/Unit/State',
                                      4: 'Production during the Preceding month of current year'}
        for position, new_name in column_positions_to_rename.items():
            if position < len(table.columns):  # skip positions beyond the table width
                table.rename(columns={table.columns[position]: new_name}, inplace=True)

        # Keep only the name column and the preceding-month column (zero-indexed 0 and 4)
        table = table.iloc[:, [0, 4]]
        print(f"*****************************{label}*************************")
        print(table)


def main():
    url = "https://mopng.gov.in/en/petroleum-statistics/monthly-production"
    list_of_pdf = get_list_of_pdfs(url)
    for pdf_url in list_of_pdf:
        get_pdf_info(pdf_url)


if __name__ == "__main__":  # pragma: no cover
    main()
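
To make the positional rename-and-slice in get_pdf_info concrete, here is a toy run on a hypothetical five-column frame of the kind tabula returns for these PDFs (column labels and data values are invented):

import pandas as pd

raw = pd.DataFrame(
    [["ONGC", "10.1", "9.8", "120.3", "9.9"]],
    columns=["Unnamed: 0", "Unnamed: 1", "Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
)
positions = {0: 'Name of Undertaking/Unit/State',
             4: 'Production during the Preceding month of current year'}
for pos, name in positions.items():
    if pos < len(raw.columns):  # skip positions beyond the table width
        raw.rename(columns={raw.columns[pos]: name}, inplace=True)
tidy = raw.iloc[:, [0, 4]]  # keep only the name and preceding-month columns
print(tidy)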
4 changes: 2 additions & 2 deletions src/__name__/main.py → src/india_mopng_etl/main.py
@@ -1,8 +1,8 @@
import pandas as pd

from helper_functions_ea import Logger
-from __name__.metadata import metadata
-from __name__.utils.base_classes import DataExtractor
+from india_mopng_etl.metadata import metadata
+from india_mopng_etl.utils.base_classes import DataExtractor


class __Class_Name__(DataExtractor): # make sure you rename the class to your preference
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -1,4 +1,4 @@
-from __name__.utils import *
+from india_mopng_etl.utils import *


class BaseVariables:
File renamed without changes.
4 changes: 2 additions & 2 deletions tests/test_main.py
@@ -1,8 +1,8 @@
import argparse
import pytest
import pandas as pd
-from __name__.main import __Class_Name__
-from __name__ import main
+from india_mopng_etl.main import __Class_Name__
+from india_mopng_etl import main

dummy_data = pd.DataFrame()
