Skip to content

Commit

Permalink
Adding final version of download_sources
Browse files Browse the repository at this point in the history
  • Loading branch information
lcsrodriguez committed Jul 22, 2022
1 parent 4d7d358 commit c9d027b
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 6 deletions.
49 changes: 44 additions & 5 deletions headat/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
"""
import numpy as np
import pandas as pd
import pyspark
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import wfdb as wf
from wfdb.io.convert import wfdb_to_wav, wfdb_to_edf
import scipy.io
import pyspark
import tqdm
import requests
import validators
import wget
Expand Down Expand Up @@ -119,19 +119,58 @@ def download_sources(self, url_parent_folder: str = ""):
"""

# Creation of a dedicated sub-folder named "samples/"
path_filename = f"{self.folder_name}/samples/"
path_filename = f"{self.folder_name}samples/"
self.samples_foldername = path_filename
print("ok")
if not os.path.isdir(path_filename) or not os.path.exists(path_filename):
try:
os.mkdir(path_filename)
return path_filename
except Exception as e:
raise Exception("An error has occured during the sub-folder creation process.\nError details: {e}")

print(path_filename)

# Processing the URL
# TODO
# Checking whether the given parent-folder argument is a valid URL
if validators.url(url_parent_folder):
try:
url = urlparse(url_parent_folder)
print(f"URL : {url}")

# Restriction to the physionet.org webpages
if url.scheme == "https":
if url.netloc == "physionet.org":
if url.path.split("/")[1] == "files":
# Download the files

# Getting the list of files from url
r = requests.get(url.geturl())
data = r.text
soup = BeautifulSoup(data, "html.parser")

# Formatting the relevant files
links = {
k.get("href"): url.geturl() + k.get("href") for k in soup.find_all("a")[1:] if
k.get("href").split(".")[-1] in ["hea", "dat"]
}

# Downloading the files
for file, link in tqdm.tqdm(links.items()):
tqdm.tqdm.write(f"Processing link : {link}")
wget.download(url=link,
out=f"{self.samples_foldername}{file}",
bar=None)
print(f"Downloading from {url_parent_folder} completed successfully")
else:
raise ValueError("You have to specify a files/ subfolder")
else:
raise ValueError("Headat only covers the 'physionet.org' web resources.")
else:
raise ValueError("Headat only covers HTTPS protocol for web resources.")
except Exception as e:
raise Exception(f"An exception has occured during ")
# If not, the argument is rejected: only valid URLs are accepted here
else:
raise ValueError("The argument specified is not a valid URL.")



Expand Down
2 changes: 1 addition & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

a = HDView()
#a.add_record(file)
a.download_sources()
a.download_sources("https://physionet.org/files/aami-ec13/1.0.0/")

#a.add_record("https://physionet.org/files/aami-ec13/1.0.0/")
#a.t_html()
Expand Down

0 comments on commit c9d027b

Please sign in to comment.