From 028025519652c86ba3598cf460437fc40f581d5f Mon Sep 17 00:00:00 2001 From: Lukasz Przychodzien Date: Sun, 21 Jan 2024 15:08:12 -0500 Subject: [PATCH 1/4] orange book init --- .../view-ndc_to_orange_book_te_code.sql | 22 ----- .../orange_book/int_fda_ndc_to_te.sql | 21 +++++ .../orange_book/_orange_book__models.yml | 1 + .../orange_book/_orange_book__sources.yml | 86 +++++++++++++++++++ 4 files changed, 108 insertions(+), 22 deletions(-) delete mode 100644 airflow/dags/orange_book/view-ndc_to_orange_book_te_code.sql create mode 100644 dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql create mode 100644 dbt/sagerx/models/staging/orange_book/_orange_book__models.yml create mode 100644 dbt/sagerx/models/staging/orange_book/_orange_book__sources.yml diff --git a/airflow/dags/orange_book/view-ndc_to_orange_book_te_code.sql b/airflow/dags/orange_book/view-ndc_to_orange_book_te_code.sql deleted file mode 100644 index db0b4ad9..00000000 --- a/airflow/dags/orange_book/view-ndc_to_orange_book_te_code.sql +++ /dev/null @@ -1,22 +0,0 @@ -/* flatfile.ndc_to_orange_book_te_code */ --- NOTE: this requires both staging.stg_fda_ndc__ndc and datasource.orange_book --- DISCLAIMER: because I can't figure out NDC-level mapping, I only include applications with a single OB TE code -CREATE OR REPLACE VIEW intermediate.ndc_to_orange_book_te_code -AS - WITH cte AS ( - SELECT - fda.ndc11 - , obp.te_code - , COUNT(fda.ndc11) OVER( PARTITION BY fda.ndc11 ) AS num_te_codes - FROM datasource.orange_book_products obp - INNER JOIN staging.stg_fda_ndc__ndc fda ON concat(CASE WHEN obp.appl_type = 'A' THEN 'ANDA' ELSE 'NDA' END, obp.appl_no) = fda.applicationnumber - GROUP BY fda.ndc11, obp.te_code - ) - SELECT - fda.ndc11 - , fda.applicationnumber AS application_number - , cte.te_code - , LEFT(cte.te_code, 2) AS first_two_te_code - , LEFT(cte.te_code, 1) AS first_one_te_code - FROM staging.stg_fda_ndc__ndc fda - INNER JOIN cte ON fda.ndc11 = cte.ndc11 AND cte.num_te_codes = 1 \ No newline at end of file diff --git a/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql b/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql new file mode 100644 index 00000000..f2094816 --- /dev/null +++ b/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql @@ -0,0 +1,21 @@ +-- DISCLAIMER: because I can't figure out NDC-level mapping, I only include applications with a single OB TE code +WITH cte AS ( + SELECT + fda.ndc11 + , obp.te_code + , COUNT(fda.ndc11) OVER( PARTITION BY fda.ndc11 ) AS num_te_codes + FROM {{ source('orange_book', 'orange_book_products') }} AS obp + INNER JOIN {{ ref('staging.stg_fda_ndc__ndc') }} AS fda + ON concat(CASE WHEN obp.appl_type = 'A' THEN 'ANDA' ELSE 'NDA' END, obp.appl_no) = fda.applicationnumber + GROUP BY fda.ndc11, obp.te_code +) +SELECT + fda.ndc11 + , fda.applicationnumber AS application_number + , cte.te_code + , LEFT(cte.te_code, 2) AS first_two_te_code + , LEFT(cte.te_code, 1) AS first_one_te_code +FROM {{ ref('staging.stg_fda_ndc__ndc') }} AS fda +INNER JOIN cte + ON fda.ndc11 = cte.ndc11 + AND cte.num_te_codes = 1 \ No newline at end of file diff --git a/dbt/sagerx/models/staging/orange_book/_orange_book__models.yml b/dbt/sagerx/models/staging/orange_book/_orange_book__models.yml new file mode 100644 index 00000000..22817d2a --- /dev/null +++ b/dbt/sagerx/models/staging/orange_book/_orange_book__models.yml @@ -0,0 +1 @@ +version: 2 diff --git a/dbt/sagerx/models/staging/orange_book/_orange_book__sources.yml b/dbt/sagerx/models/staging/orange_book/_orange_book__sources.yml new file mode 100644 index 00000000..73d7abfd --- /dev/null +++ b/dbt/sagerx/models/staging/orange_book/_orange_book__sources.yml @@ -0,0 +1,86 @@ +version: 2 + +sources: + - name: orange_book + description: > + The publication Approved Drug Products with Therapeutic Equivalence Evaluations (commonly known as the Orange Book) + identifies drug products approved on the basis of safety and effectiveness by the Food and Drug Administration (FDA) + under the Federal Food, Drug, and Cosmetic Act (the Act) and related patent and exclusivity information. + For more information on the Orange Book including its history, see the Orange Book Preface. + schema: datasource + tables: + - name: orange_book_exlusivity + description: Related exclusivity information of drug products approved on the basis of safety and effectiveness by the Food and Drug Administration (FDA) under the Federal Food, Drug, and Cosmetic Act (the Act). + columns: + - name: appl_type + description: The type of new drug application approval. New Drug Applications (NDA or innovator) are 'N'. Abbreviated New Drug Applications (ANDA or generic) are 'A'. + - name: appl_no + description: The FDA assigned New Drug Application (NDA) number to the application. Format is nnnnnn. + - name: product_no + description: The FDA assigned number to identify the application products. Each strength is a separate product. May repeat for multiple part products. Format is nnn. + - name: exclusivity_code + description: Code to designate exclusivity granted by the FDA to a drug product. Format is nnnnnnnnnn. + - name: exclusivity_date + description: The date the exclusivity expires. Format is MMM DD, YYYY. + + - name: orange_book_patent + description: Related patent information of drug products approved on the basis of safety and effectiveness by the Food and Drug Administration (FDA) under the Federal Food, Drug, and Cosmetic Act (the Act). + columns: + - name: appl_type + description: The type of new drug application approval. New Drug Applications (NDA or innovator) are 'N'. Abbreviated New Drug Applications (ANDA or generic) are 'A'. + - name: appl_no + description: The FDA assigned number to the New Drug Application (NDA). Format is nnnnnn. + - name: product_no + description: The FDA assigned number to identify the application products. Each strength is a separate product. May repeat for multiple part products. Format is nnn. + - name: patent_no + description: Patent numbers as submitted by the applicant holder for patents covered by the statutory provisions. May repeat for multiple applications and multiple products. Includes pediatric exclusivity granted by the agency. Format is nnnnnnnnnnn. + - name: patent_expire_date_text + description: The date the patent expires as submitted by the applicant holder including applicable extensions. The format is MMM DD, YYYY. + - name: drug_substance_flag + description: Patents submitted on FDA Form 3542 and listed after August 18, 2003 may have a drug substance flag indicating the sponsor submitted the patent as claiming the drug substance. Format is Y or null. + - name: drug_product_flag + description: Patents submitted on FDA Form 3542 and listed after August 18, 2003 may have a drug product flag indicating the sponsor submitted the patent as claiming the drug product. Format is Y or null. + - name: patent_use_code + description: Code to designate a use patent that covers the approved indication or use of a drug product. May repeat for multiple applications, multiple products and multiple patents. Format is nnnnnnnnnn. + - name: delist_flag + description: > + Sponsor has requested patent be delisted. + This patent has remained listed because, under Section 505(j)(5)(D)(i) of the Act, a first applicant may retain eligibility for 180-day exclusivity based on a paragraph IV certification to this patent for a certain period. + Applicants under Section 505(b)(2) are not required to certify to patents where this flag is set to Y. + Format is Y or null. + - name: submission_date + description: The date on which the FDA receives patent information from the new drug application (NDA) holder. Format is Mmm d, yyyy. + + - name: orange_book_products + description: Drug products approved on the basis of safety and effectiveness by the Food and Drug Administration (FDA) under the Federal Food, Drug, and Cosmetic Act (the Act). + columns: + - name: ingredient + description: The active ingredient(s) for the product. Multiple ingredients are in alphabetical order, separated by a semicolon. + - name: df_route + description: The product dosage form and route separated by a semi-colon. The format is not all uppercase. + - name: trade_name + description: The trade name of the product as shown on the labeling. + - name: applicant + description: The firm name holding legal responsibility for the new drug application. The firm name is condensed to a maximum twenty character unique string. + - name: strength + description: The potency of the active ingredient. May repeat for multiple part products. + - name: appl_type + description: The type of new drug application approval. New Drug Applications (NDA or innovator) are ā€Nā€. Abbreviated New Drug Applications (ANDA or generic) are ā€œAā€. + - name: appl_no + description: The FDA assigned number to the application. Format is nnnnnn. + - name: product_no + description: The FDA assigned number to identify the application products. Each strength is a separate product. May repeat for multiple part products. Format is nnn. + - name: te_code + description: The Therapeutic Equivalence (TE) Code indicates the therapeutic equivalence rating of generic to innovator Rx products. + - name: approval_date + description: > + The date the product was approved as stated in the FDA approval letter to the applicant. + The format is Mmm dd, yyyy. Products approved prior to the January 1, 1982 contain the phrase: 'Approved prior to Jan 1, 1982'. + - name: rld + description: The Reference Listed Drug (RLD) is a drug product approved under section 505(c) of the FD&C Act for which FDA has made a finding of safety and effectiveness. In the electronic Orange Book, an RLD is identified by 'RLD' in the RLD column. + - name: rs + description: A 'reference standard' is the drug product selected by FDA that an applicant seeking approval of an ANDA must use in conducting an in vivo bioequivalence study required for approval of an ANDA. In the electronic Orange Book, a reference standard is identified by 'RS' in the RS column. + - name: type + description: The group or category of approved drugs. Format is RX, OTC, DISCN. + - name: applicant_full_name + description: The full name of the firm holding legal responsibility for the new drug application. From f087ea8701a4197e926f6058de4fa09edc77d3f6 Mon Sep 17 00:00:00 2001 From: Lukasz Przychodzien Date: Sun, 21 Jan 2024 15:26:09 -0500 Subject: [PATCH 2/4] int update --- .../models/intermediate/orange_book/int_fda_ndc_to_te.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql b/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql index f2094816..a02b2ec4 100644 --- a/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql +++ b/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql @@ -5,7 +5,7 @@ WITH cte AS ( , obp.te_code , COUNT(fda.ndc11) OVER( PARTITION BY fda.ndc11 ) AS num_te_codes FROM {{ source('orange_book', 'orange_book_products') }} AS obp - INNER JOIN {{ ref('staging.stg_fda_ndc__ndc') }} AS fda + INNER JOIN {{ ref('stg_fda_ndc__ndcs') }} AS fda ON concat(CASE WHEN obp.appl_type = 'A' THEN 'ANDA' ELSE 'NDA' END, obp.appl_no) = fda.applicationnumber GROUP BY fda.ndc11, obp.te_code ) @@ -15,7 +15,7 @@ SELECT , cte.te_code , LEFT(cte.te_code, 2) AS first_two_te_code , LEFT(cte.te_code, 1) AS first_one_te_code -FROM {{ ref('staging.stg_fda_ndc__ndc') }} AS fda +FROM {{ ref('stg_fda_ndc__ndcs') }} AS fda INNER JOIN cte ON fda.ndc11 = cte.ndc11 AND cte.num_te_codes = 1 \ No newline at end of file From 694df32991ee3ca6e0b35d2ee682d98288968f37 Mon Sep 17 00:00:00 2001 From: Joey LeGrand Date: Sun, 21 Jan 2024 23:49:34 -0600 Subject: [PATCH 3/4] Change name of Airflow SQL files --- .../orange_book/load-orange_book_patent.sql | 18 --------------- ...k_exclusivity.sql => load_exclusivity.sql} | 0 airflow/dags/orange_book/load_patent.sql | 18 +++++++++++++++ airflow/dags/orange_book/load_products.sql | 22 +++++++++++++++++++ 4 files changed, 40 insertions(+), 18 deletions(-) delete mode 100644 airflow/dags/orange_book/load-orange_book_patent.sql rename airflow/dags/orange_book/{load-orange_book_exclusivity.sql => load_exclusivity.sql} (100%) create mode 100644 airflow/dags/orange_book/load_patent.sql create mode 100644 airflow/dags/orange_book/load_products.sql diff --git a/airflow/dags/orange_book/load-orange_book_patent.sql b/airflow/dags/orange_book/load-orange_book_patent.sql deleted file mode 100644 index 6f2a320f..00000000 --- a/airflow/dags/orange_book/load-orange_book_patent.sql +++ /dev/null @@ -1,18 +0,0 @@ -/* datasource.orange_book_patent */ -DROP TABLE IF EXISTS datasource.orange_book_patent; - -CREATE TABLE datasource.orange_book_patent ( -Appl_Type TEXT, -Appl_No TEXT, -Product_No TEXT, -Patent_No TEXT, -Patent_Expire_Date_Text TEXT, -Drug_Substance_Flag TEXT, -Drug_Product_Flag TEXT, -Patent_Use_Code TEXT, -Delist_Flag TEXT, -Submission_Date TEXT -); - -COPY datasource.orange_book_patent -FROM '{data_path}/patent.txt' DELIMITER '~' CSV HEADER; \ No newline at end of file diff --git a/airflow/dags/orange_book/load-orange_book_exclusivity.sql b/airflow/dags/orange_book/load_exclusivity.sql similarity index 100% rename from airflow/dags/orange_book/load-orange_book_exclusivity.sql rename to airflow/dags/orange_book/load_exclusivity.sql diff --git a/airflow/dags/orange_book/load_patent.sql b/airflow/dags/orange_book/load_patent.sql new file mode 100644 index 00000000..7747b297 --- /dev/null +++ b/airflow/dags/orange_book/load_patent.sql @@ -0,0 +1,18 @@ +/* datasource.orange_book_patent */ +DROP TABLE IF EXISTS datasource.orange_book_patent; + +CREATE TABLE datasource.orange_book_patent ( +appl_type TEXT, +appl_no TEXT, +product_no TEXT, +patent_no TEXT, +patent_expire_date_text TEXT, +drug_substance_flag TEXT, +drug_product_flag TEXT, +patent_use_code TEXT, +delist_flag TEXT, +submission_date TEXT +); + +COPY datasource.orange_book_patent +FROM '{data_path}/patent.txt' DELIMITER '~' CSV HEADER; \ No newline at end of file diff --git a/airflow/dags/orange_book/load_products.sql b/airflow/dags/orange_book/load_products.sql new file mode 100644 index 00000000..a0502eaa --- /dev/null +++ b/airflow/dags/orange_book/load_products.sql @@ -0,0 +1,22 @@ +/* datasource.orange_book_products */ +DROP TABLE IF EXISTS datasource.orange_book_products; + +CREATE TABLE datasource.orange_book_products ( +ingredient TEXT, +df_route TEXT, +trade_name TEXT, +applicant TEXT, +strength TEXT, +appl_type TEXT, +appl_no TEXT, +product_no TEXT, +te_code TEXT, +approval_date TEXT, +rld TEXT, +rs TEXT, +type TEXT, +applicant_full_name TEXT +); + +COPY datasource.orange_book_products +FROM '{data_path}/products.txt' DELIMITER '~' CSV HEADER; \ No newline at end of file From e96d14c168a232912cf251608fff85f1f2c5d7b1 Mon Sep 17 00:00:00 2001 From: Joey LeGrand Date: Sun, 21 Jan 2024 23:50:06 -0600 Subject: [PATCH 4/4] Document intermediate model --- .../orange_book/load-orange_book_products.sql | 22 ------------- .../orange_book/_int_orange_book__models.yml | 25 ++++++++++++++ .../orange_book/int_fda_ndc_to_te.sql | 33 ++++++++++--------- 3 files changed, 42 insertions(+), 38 deletions(-) delete mode 100644 airflow/dags/orange_book/load-orange_book_products.sql create mode 100644 dbt/sagerx/models/intermediate/orange_book/_int_orange_book__models.yml diff --git a/airflow/dags/orange_book/load-orange_book_products.sql b/airflow/dags/orange_book/load-orange_book_products.sql deleted file mode 100644 index a0502eaa..00000000 --- a/airflow/dags/orange_book/load-orange_book_products.sql +++ /dev/null @@ -1,22 +0,0 @@ -/* datasource.orange_book_products */ -DROP TABLE IF EXISTS datasource.orange_book_products; - -CREATE TABLE datasource.orange_book_products ( -ingredient TEXT, -df_route TEXT, -trade_name TEXT, -applicant TEXT, -strength TEXT, -appl_type TEXT, -appl_no TEXT, -product_no TEXT, -te_code TEXT, -approval_date TEXT, -rld TEXT, -rs TEXT, -type TEXT, -applicant_full_name TEXT -); - -COPY datasource.orange_book_products -FROM '{data_path}/products.txt' DELIMITER '~' CSV HEADER; \ No newline at end of file diff --git a/dbt/sagerx/models/intermediate/orange_book/_int_orange_book__models.yml b/dbt/sagerx/models/intermediate/orange_book/_int_orange_book__models.yml new file mode 100644 index 00000000..fabd23c7 --- /dev/null +++ b/dbt/sagerx/models/intermediate/orange_book/_int_orange_book__models.yml @@ -0,0 +1,25 @@ +version: 2 + +models: + - name: int_fda_ndc_to_te + description: | + > DISCLAIMER: This model is under development and incomplete. + + This is an attempt at an NDC-level mapping of Orange Book TE codes. There are cases in Orange Book where two different strengths of a given ANDA have different TE codes. This means within an ANDA, two different NDCs could have different TE codes. + + The problem we're encountering is that there's not a reliable / programmatic way to get from the strength of a product in Orange Book and the strength of the product in the FDA NDC Directory. + + Because I can't figure out NDC-level mapping, I only include applications with a single OB TE code. + + Again - please only treat this as the beginning of a proof of concept and do not use for clinical or other purposes. + columns: + - name: ndc11 + description: The NDC11 of the product. + - name: application_number + description: The ANDA / NDA / etc number. + - name: te_code + description: The full therapeutic equvalency (TE) code as listed within Orange Book. + - name: first_two_te_code + description: Just the first two characters of the TE code - for ease of use downstream. + - name: first_one_te_code + description: Just the first character of the TE code - for ease of use downstream. diff --git a/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql b/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql index a02b2ec4..1f138dea 100644 --- a/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql +++ b/dbt/sagerx/models/intermediate/orange_book/int_fda_ndc_to_te.sql @@ -1,21 +1,22 @@ --- DISCLAIMER: because I can't figure out NDC-level mapping, I only include applications with a single OB TE code -WITH cte AS ( - SELECT +-- DISCLAIMER: This model is under development and incomplete. + +with cte as ( + select fda.ndc11 , obp.te_code - , COUNT(fda.ndc11) OVER( PARTITION BY fda.ndc11 ) AS num_te_codes - FROM {{ source('orange_book', 'orange_book_products') }} AS obp - INNER JOIN {{ ref('stg_fda_ndc__ndcs') }} AS fda - ON concat(CASE WHEN obp.appl_type = 'A' THEN 'ANDA' ELSE 'NDA' END, obp.appl_no) = fda.applicationnumber - GROUP BY fda.ndc11, obp.te_code + , count(fda.ndc11) over( partition by fda.ndc11 ) as num_te_codes + from {{ source('orange_book', 'orange_book_products') }} as obp + inner join {{ ref('stg_fda_ndc__ndcs') }} as fda + on concat(case when obp.appl_type = 'A' then 'ANDA' else 'NDA' end, obp.appl_no) = fda.applicationnumber + group by fda.ndc11, obp.te_code ) -SELECT +select fda.ndc11 - , fda.applicationnumber AS application_number + , fda.applicationnumber as application_number , cte.te_code - , LEFT(cte.te_code, 2) AS first_two_te_code - , LEFT(cte.te_code, 1) AS first_one_te_code -FROM {{ ref('stg_fda_ndc__ndcs') }} AS fda -INNER JOIN cte - ON fda.ndc11 = cte.ndc11 - AND cte.num_te_codes = 1 \ No newline at end of file + , left(cte.te_code, 2) as first_two_te_code + , left(cte.te_code, 1) as first_one_te_code +from {{ ref('stg_fda_ndc__ndcs') }} as fda +inner join cte + on fda.ndc11 = cte.ndc11 + and cte.num_te_codes = 1