diff --git a/_quarto.yml b/_quarto.yml index db57a5d..9f440cb 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -41,7 +41,8 @@ website: contents: - sections/0_causality/causal_intro/article/intro-causality.qmd - sections/0_causality/open_data_citation_advantage.qmd - - sections/0_causality/social_causality.qmd + - sections/0_causality/social_causality.qmd + - sections/0_causality/open_data_cost_savings.qmd - title: Open Science contents: diff --git a/references.bib b/references.bib index fa53afe..da932c4 100644 --- a/references.bib +++ b/references.bib @@ -41,6 +41,28 @@ @article{aksnes2019 langid = {en} } +@article{ali-khan2018, + title = {Defining Success in Open Science}, + author = {Ali-Khan, Sarah E. and Jean, Antoine and MacDonald, Emily and Gold, E. Richard}, + year = {2018}, + month = {03}, + journal = {MNI Open Research}, + pages = {2}, + volume = {2}, + doi = {10.12688/mniopenres.12780.2} +} + +@article{ali2022, + title = {The Role of FAIR Data towards Sustainable Agricultural Performance: A Systematic Literature Review}, + author = {Ali, Basharat and Dahlhaus, Peter}, + year = {2022}, + journal = {Agriculture}, + pages = {309}, + volume = {12}, + number = {2}, + doi = {10.3390/agriculture12020309} +} + @book{almerud2022, title = {Knowledge ecosystems in the new ERA: using a competence based approach for career development in academia and beyond}, author = {Almerud, Mikaela and Ricksten, Maria and {O{\textquoteright}Neill}, Gareth and Weijden, Inge van der and Kaltenbrunner, Wolfgang and {Núñez}, Lidia and De Coen, An}, @@ -128,6 +150,40 @@ @article{armitage2020 url = {https://doi.org/10.1162/qss_a_00071} } +@article{arshad2016, + title = {Open Access Could Transform Drug Discovery: A Case Study of JQ1}, + author = {Arshad, Zeeshaan and Smith, James and Roberts, Mackenna and Lee, Wen Hwa and Davies, Ben and Bure, Kim and Hollander, Georg A. 
and Dopson, Sue and Bountra, Chas and Brindley, David}, + year = {2016}, + journal = {Expert Opinion on Drug Discovery}, + pages = {321--332}, + volume = {11}, + number = {3}, + doi = {10.1517/17460441.2016.1144587} +} + +@article{arzberger_promoting_2004, + title = {Promoting {Access} to {Public} {Research} {Data} for {Scientific}, {Economic}, and {Social} {Development}}, + volume = {3}, + issn = {1683-1470}, + doi = {10.2481/dsj.3.135}, + language = {en}, + journal = {Data Science Journal}, + author = {Arzberger, P and Schroeder, P and Beaulieu, A and Bowker, G and Casey, K and Laaksonen, L and Moorman, D and Uhlir, P and Wouters, P}, + year = {2004}, + pages = {135--152} +} + +@article{assen2014, + title = {Why Publishing Everything Is More Effective than Selective Publishing of Statistically Significant Results}, + author = {Assen, Marcel A. L. M. van and Aert, Robbie C. M. van and Nuijten, {Michèle B.} and Wicherts, Jelte M.}, + year = {2014}, + journal = {PLOS ONE}, + pages = {e84896}, + volume = {9}, + number = {1}, + doi = {10.1371/journal.pone.0084896} +} + @article{baker2016, title = {1,500 scientists lift the lid on reproducibility}, author = {Baker, Monya}, @@ -155,6 +211,33 @@ @article{balke2012 url = {https://www.tandfonline.com/doi/abs/10.1080/01621459.1997.10474074} } +@techreport{beagrie2012, + title = {Economic Impact Evaluation of the Economic and Social Data Service}, + author = {Beagrie, N. and Houghton, J.}, + year = {2012} +} + +@techreport{beagrie2014, + title = {The Value and Impact of Data Sharing and Curation. A synthesis of three recent studies of UK research data centres}, + author = {Beagrie, Neil and Houghton, John}, + year = {2014} +} + + +@techreport{beagrie2016, + title = {The Value and Impact of the European Bioinformatics Institute}, + author = {Beagrie, N. 
and Houghton, J.}, + year = {2016}, + url = {https://www.embl.org/documents/wp-content/uploads/2021/09/EMBL-EBI_Impact_report-2016-summary.pdf} +} + + +@techreport{beagrie2021, + title = {Data-driven discovery: The value and impact of EMBL-EBI managed data resources}, + author = {Beagrie, Neil and Houghton, John}, + year = {2021} +} + @book{bellis2009, title = {Bibliometrics and Citation Analysis: From the Science Citation Index to Cybermetrics}, author = {Bellis, Nicola De}, @@ -240,7 +323,6 @@ @article{brembs2019 url = {http://dx.doi.org/10.1371/journal.pbio.3000117} } - @techreport{brown2016, title = {The GRIM test: A simple technique detects numerous anomalies in the reporting of results in psychology}, author = {Brown, Nicholas J. L. and Heathers, James A.}, @@ -252,7 +334,6 @@ @techreport{brown2016 langid = {en} } - @article{bryan2021, title = {The impact of open access mandates on invention}, author = {Bryan, Kevin A. and Ozcan, Yasin}, @@ -278,6 +359,7 @@ @article{budi2022 doi = {10.1007/s11192-022-04567-4} } + @article{butler_explaining_2003, title = {Explaining {Australia}'s increased share of {ISI} publications—the effects of a funding formula based on publication counts}, volume = {32}, @@ -291,6 +373,7 @@ @article{butler_explaining_2003 pages = {143--155} } + @article{carlin2023, title = {Where is all the research software? 
An analysis of software in UK academic repositories}, author = {Carlin, Domhnall and Rainer, Austen and Wilson, David}, @@ -302,7 +385,6 @@ @article{carlin2023 url = {https://peerj.com/articles/cs-1546/}, note = {Publisher: PeerJ Inc.} } - @article{castelnovo2023, title = {A quasi-experimental design to assess the innovative impact of public procurement: An application to the Italian space industry}, author = {Castelnovo, Paolo and {Clò}, Stefano and Florio, Massimo}, @@ -350,6 +432,42 @@ @inproceedings{cazacu2023 address = {New York, NY, USA} } +@inproceedings{chan_cost_2015, + title = {Cost {Impact} in {Managing} the {Transition} to an {Open} {Access} {Model}}, + isbn = {978-1-941269-03-9}, + doi = {10.5703/1288284315595}, + booktitle = {The {Importance} of {Being} {Earnest}}, + publisher = {Against the Grain}, + author = {Chan, Gayle Rosemary}, + month = sep, + year = {2015}, + pages = {358--362} +} + +@inbook{chataway2018, + title = {How Will Open Science Impact on University/Industry Collaborations?}, + author = {Chataway, Joanna and Parks, Sarah and Smith, Elta}, + editor = {Meissner, Dirk and Erdil, Erkan and Chataway, Joanna}, + year = {2018}, + publisher = {Springer International Publishing}, + pages = {265--282}, + series = {Science, Technology and Innovation Studies}, + url = {https://doi.org/10.1007/978-3-319-62649-9_12}, + note = {DOI: 10.1007/978-3-319-62649-9{\_}12}, + address = {Cham} +} + +@inproceedings{chen2017, + title = {2017 IEEE International Symposium on Multimedia (ISM)}, + author = {Chen, Sheng-Chih and Chen, Yi-Cheng and Chen, Wei-Lin}, + year = {2017}, + publisher = {IEEE}, + pages = {469--474}, + doi = {10.1109/ISM.2017.93}, + url = {http://ieeexplore.ieee.org/document/8241659/}, + address = {Taichung} +} + @inproceedings{cinelli_sensitivity_2019, title = {Sensitivity {Analysis} of {Linear} {Structural} {Causal} {Models}}, url = {https://proceedings.mlr.press/v97/cinelli19a.html}, @@ -381,7 +499,6 @@ @misc{codeof langid = {en} } - 
@article{cohen2002, title = {Links and Impacts: The Influence of Public Research on Industrial R&D}, author = {Cohen, Wesley M. and Nelson, Richard R. and Walsh, John P.}, @@ -397,7 +514,6 @@ @article{cohen2002 langid = {en} } - @article{colavizza2020, title = {The citation advantage of linking publications to research data}, author = {Colavizza, Giovanni and Hrynaszkiewicz, Iain and Staden, Isla and Whitaker, Kirstie and McGillivray, Barbara}, @@ -413,6 +529,7 @@ @article{colavizza2020 note = {Publisher: Public Library of Science}, langid = {en} } + @article{cole_chance_1981, title = {Chance and consensus in peer review}, volume = {214}, @@ -495,6 +612,18 @@ @misc{datasci url = {https://edison-project.eu/data-science-competence-framework-cf-ds/} } +@article{davies2016, + title = {Researching the emerging impacts of open data: revisiting the ODDC conceptual framework}, + author = {Davies, Tim and Perini, Fernando}, + year = {2016}, + month = {06}, + date = {2016-06-21}, + journal = {The Journal of Community Informatics}, + volume = {12}, + number = {2}, + doi = {10.15353/joci.v12i2.3246} +} + @article{davis_open_2008, title = {Open access publishing, article downloads, and citations: {Randomised} controlled trial}, volume = {337}, @@ -671,6 +800,15 @@ @article{dulongderosnay2014 note = {Publisher: Universidad de Talca} } +@book{EC-DGRI2018, + title = {Cost-benefit analysis for FAIR research data: cost of not having FAIR research data.}, + author = {{European Commission. 
Directorate General for Research and Innovation.} and {PwC EU Services.}}, + year = {2018}, + publisher = {Publications Office}, + doi = {10.2777/02999}, + url = {https://data.europa.eu/doi/10.2777/02999} +} + @book{EC2018, title = {Cost-benefit analysis for FAIR research data: cost of not having FAIR research data}, year = {2018}, @@ -713,6 +851,16 @@ @article{estimati2015 langid = {en} } +@book{EU2020, + title = {The economic impact of open data: opportunities for value creation in Europe}, + author = {{Publications Office of the European Union} and Huyer, Esther and Knippenberg, Laura van}, + year = {2020}, + publisher = {Publications Office of the European Union}, + doi = {10.2830/63132}, + url = {https://data.europa.eu/doi/10.2830/63132}, + langid = {eng} +} + @article{fang2020, title = {An extensive analysis of the presence of altmetric data for Web of Science publications across subject fields and research topics}, author = {Fang, Zhichao and Costas, Rodrigo and Tian, Wencan and Wang, Xianwen and Wouters, Paul}, @@ -1000,6 +1148,7 @@ @article{gordon2021 langid = {en} } + @article{gormally2012, title = {Developing a Test of Scientific Literacy Skills (TOSLS): Measuring Undergraduates{\textquoteright} Evaluation of Scientific Information and Arguments}, author = {Gormally, Cara and Brickman, Peggy and Lutz, Mary}, @@ -1011,6 +1160,7 @@ @article{gormally2012 doi = {10.1187/cbe.12-03-0026} } + @misc{goyal_causal_2024, title = {Causal {Effect} of {Group} {Diversity} on {Redundancy} and {Coverage} in {Peer}-{Reviewing}}, doi = {10.48550/arXiv.2411.11437}, @@ -1028,6 +1178,7 @@ @article{grimme langid = {en} } + @article{group_fair_2020, title = {{FAIR} {Data} {Maturity} {Model}.
{Specification} and {Guidelines}}, doi = {10.15497/rda00050}, @@ -1077,6 +1228,18 @@ @article{hagen2008 note = {Publisher: Public Library of Science San Francisco, USA} } +@inproceedings{harding2017, + title = {Expanding Perspectives on Open Science: Communities, Cultures and Diversity in Concepts and Practices}, + booktitle = {Proceedings of the 21st International Conference on Electronic Publishing}, + editor = {Chan, Leslie and Loizides, Fernando}, + author = {Harding, Rachel J.}, + year = {2017}, + date = {2017}, + pages = {1--5}, + address = {Limassol, Cyprus} +} + + @article{hardwicke2018, title = {Data availability, reusability, and analytic reproducibility: evaluating the impact of a mandatory open data policy at the journal {\emph{Cognition}}}, author = {Hardwicke, Tom E. and Mathur, Maya B. and MacDonald, Kyle and Nilsonne, Gustav and Banks, George C. and Kidwell, Mallory C. and Hofelich Mohr, Alicia and Clayton, Elizabeth and Yoon, Erica J. and Henry Tessler, Michael and Lenne, Richie L. and Altman, Sara and Long, Bria and Frank, Michael C.}, @@ -1091,6 +1254,7 @@ @article{hardwicke2018 url = {https://royalsocietypublishing.org/doi/10.1098/rsos.180448} } + @article{hardwicke2021, title = {Analytic reproducibility in articles receiving open data badges at the journal Psychological Science: an observational study}, author = {Hardwicke, Tom E. and Bohn, Manuel and MacDonald, Kyle and Hembacher, Emily and Nuijten, {Michèle B.} and Peloquin, Benjamin N. and deMayo, Benjamin E. and Long, Bria and Yoon, Erica J. 
and Frank, Michael C.}, @@ -1141,6 +1305,32 @@ @book{heathers2018a note = {Publisher: OSF} } +@article{heckman1979, + title = {Sample Selection Bias as a Specification Error}, + author = {Heckman, James J.}, + year = {1979}, + journal = {Econometrica}, + pages = {153--161}, + volume = {47}, + number = {1}, + doi = {10.2307/1912352} +} + +@inproceedings{herala2016, + title = {2016 SAI Computing Conference (SAI)}, + author = {Herala, Antti and Vanhala, Erno and Porras, Jari and {Kärri}, Timo}, + year = {2016}, + pages = {715--724}, + doi = {10.1109/SAI.2016.7556060} +} + +@techreport{houghton2011, + title = {Access to research and technical information in Denmark}, + author = {Houghton, John and Swan, Alma and Brown, Sheridan}, + year = {2011}, + url = {https://eprints.soton.ac.uk/272603/} +} + @misc{howison_softcite_2023, title = {Softcite {Dataset} {Version} 2}, doi = {10.5281/zenodo.7995565}, @@ -1162,7 +1352,6 @@ @misc{hunermund_causal_2023 year = {2023} } - @inproceedings{hunter2015, title = {Formal Acknowledgement of Citizen Scientists{\textquoteright} Contributions via Dynamic Data Citations}, author = {Hunter, Jane and Hsu, Chih-Hsiang}, @@ -1176,7 +1365,6 @@ @inproceedings{hunter2015 langid = {en} } - @book{huntington-klein_effect_2021, title = {The {Effect}: {An} {Introduction} to {Research} {Design} and {Causality}}, isbn = {978-1-00-050914-4}, @@ -1198,7 +1386,6 @@ @book{huyer2020 langid = {eng} } - @article{istrate, title = {A large dataset of software mentions in the biomedical literature}, author = {Istrate, Ana-Maria and Li, Donghui and Taraborelli, Dario and Torkar, Michaela and Veytsman, Boris and Williams, Ivana}, @@ -1255,7 +1442,6 @@ @article{janssens langid = {en} } - @article{johnston2017, title = {Contemporary Guidance for Stated Preference Studies}, author = {Johnston, Robert J. and Boyle, Kevin J. and Adamowicz, {Wiktor (Vic)} and Bennett, Jeff and Brouwer, Roy and Cameron, Trudy Ann and Hanemann, W.
Michael and Hanley, Nick and Ryan, Mandy and Scarpa, Riccardo and Tourangeau, Roger and Vossler, Christian A.}, @@ -1271,7 +1457,6 @@ @article{johnston2017 note = {Publisher: The University of Chicago Press} } - @book{jung2023, title = {scrutiny: Error Detection in Science}, author = {Jung, Lukas and Allard, {Aurélien}}, @@ -1281,6 +1466,15 @@ @book{jung2023 url = {https://cran.r-project.org/web/packages/scrutiny/index.html} } +@article{karasz2024, + title = {PathOS - D1.3 Key Impact Pathways for the Open Science Framework}, + author = {Karasz, Istvan and Stoy, Lennart and Seminaroti, Elisa and Grapengiesser, Izabella Martins}, + year = {2024}, + doi = {10.5281/ZENODO.11108567}, + url = {https://zenodo.org/doi/10.5281/zenodo.11108567}, + langid = {en} +} + @article{keller2014, title = {Re-use of public sector information in cultural heritage institutions}, author = {Keller, Paul and Margoni, Thomas and Rybicka, Katarzyna and Tarkowski, Alek}, @@ -1541,6 +1735,14 @@ @article{Levontin2022 pages = {25} } +@inproceedings{lindman2014, + title = {2014 47th Hawaii International Conference on System Sciences}, + author = {Lindman, Juho and Kinnari, Tomi and Rossi, Matti}, + year = {2014}, + pages = {739--748}, + doi = {10.1109/HICSS.2014.99} +} + @article{liu2023, title = {Data, measurement and empirical methods in the science of science}, author = {Liu, Lu and Jones, Benjamin F. 
and Uzzi, Brian and Wang, Dashun}, @@ -1660,6 +1862,20 @@ @article{marino2018 langid = {en} } +@article{mazzucato_entrepreneurial_2011, + title = {The entrepreneurial state}, + volume = {49}, + issn = {1362-6620}, + doi = {10.3898/136266211798411183}, + language = {en}, + number = {49}, + journal = {Soundings}, + author = {Mazzucato, Mariana}, + month = nov, + year = {2011}, + pages = {131--142} +} + @article{mccaffrey2020, title = {Open Science Skills Visualisation}, author = {McCaffrey, Ciara and Meyer, Thorsten and Riera Quintero, Clara and Swiatek, Cecile and Marcerou-Ramel, Nathalie and {Gillén}, Camilla and Clavel, Karin and Wojciechowska, Anna and Brinken, Helene and Prevoo, {Mariëlle} and Egerton, Frank}, @@ -1683,6 +1899,19 @@ @book{mcelreath2020 address = {Boca Raton} } +@article{mcmanamay_openaccess_2014, + title = {Open-{Access} {Databases} as {Unprecedented} {Resources} and {Drivers} of {Cultural} {Change} in {Fisheries} {Science}}, + volume = {39}, + issn = {0363-2415, 1548-8446}, + doi = {10.1080/03632415.2014.946128}, + number = {9}, + journal = {Fisheries}, + author = {McManamay, Ryan A.
and Utz, Ryan M.}, + month = sep, + year = {2014}, + pages = {417--425} +} + @book{merton1973, title = {The Sociology of Science: Theoretical and Empirical Investigations}, author = {Merton, Robert K.}, @@ -1892,6 +2121,7 @@ @book{nuijten2023 url = {https://cran.r-project.org/web/packages/statcheck/} } + @book{ocarroll2017, title = {Providing researchers with the skills and competencies they need to practise Open Science}, author = {{O'Carroll}, Conor and Hyllseth, Berit and Berg, Rinske van den and Kohl, Ulrike and Kamerlin, Caroline Lynn and Brennan, Niamh and {O{\textquoteright}Neill}, Gareth}, @@ -1938,6 +2168,7 @@ @techreport{oneill2022 langid = {eng} } + @misc{oneill2022_data, author = {Gareth O'Neill and Stefania Martziou}, howpublished = {Zenodo}, @@ -1946,6 +2177,7 @@ @misc{oneill2022_data url = {https://doi.org/10.5281/zenodo.7431678} } + @techreport{oneill2022a, title = {Monitoring Framework for National Contributions to EOSC}, author = {{O'Neill}, Gareth}, @@ -2037,6 +2269,14 @@ @article{page2021 note = {Publisher: British Medical Journal Publishing Group} } +@techreport{parsons_benefits_2011, + address = {Horsham, United Kingdom}, + title = {Benefits to the {Private} {Sector} of {Open} {Access} to {Higher} {Education} and {Scholarly} {Research}}, + institution = {HOST Policy Research}, + author = {Parsons, David and Willis, Dick and Holland, Jane}, + year = {2011} +} + @article{pasquetto2017, title = {On the Reuse of Scientific Data}, author = {Pasquetto, Irene V. and Randles, Bernadette M. 
and Borgman, Christine L.}, @@ -2161,7 +2401,6 @@ @article{radicchi2008 url = {http://www.pnas.org/cgi/content/abstract/105/45/17268} } - @article{rafols_monitoring_2024, title = {Monitoring {Open} {Science} as transformative change: {Towards} a systemic framework}, shorttitle = {Monitoring {Open} {Science} as transformative change}, @@ -2214,7 +2453,6 @@ @inbook{roberts2007 url = {https://api.taylorfrancis.com/content/chapters/edit/download?identifierName=doi&identifierValue=10.4324/9780203824696-29&type=chapterpdf} } - @inbook{roberts2013, title = {Scientific literacy/science literacy}, author = {Roberts, Douglas A.}, @@ -2225,7 +2463,6 @@ @inbook{roberts2013 url = {https://api.taylorfrancis.com/content/chapters/edit/download?identifierName=doi&identifierValue=10.4324/9780203824696-29&type=chapterpdf} } - @inbook{roberts2013a, title = {Scientific literacy/science literacy}, author = {Roberts, Douglas A.}, @@ -2320,13 +2557,13 @@ @article{ross-hellauer2022 } @techreport{ruiter2023, - title = {Automatically Finding and Categorizing Replication Studies}, - author = {Ruiter, Bob de}, - year = {2023}, - month = {11}, - publisher = {arXiv}, - url = {http://arxiv.org/abs/2311.15055}, - doi = {10.48550/arXiv.2311.15055} + title = {Automatically Finding and Categorizing Replication Studies}, + author = {Ruiter, Bob de}, + year = {2023}, + month = {11}, + publisher = {arXiv}, + url = {http://arxiv.org/abs/2311.15055}, + doi = {10.48550/arXiv.2311.15055} } @article{schmidt2009, @@ -2385,6 +2622,15 @@ @book{smaldino2023 address = {Princeton} } +@techreport{sparceurope2019, + title = {Using open and FAIR data to increase research efficiency}, + author = {{SPARC Europe}}, + year = {2019}, + date = {2019}, + url = {https://sparceurope.org/wp-content/uploads/dlm_uploads/2019/04/SPARC-Europe_Brief_ODEfficiency.pdf} +} + + @article{squicciarini2021, title = {Demand for AI skills in jobs: Evidence from online job postings}, author = {Squicciarini, Mariagrazia and Nachtigall, Heike},
@@ -2394,6 +2640,7 @@ @article{squicciarini2021 note = {Publisher: OECD} } + @inproceedings{stavropoulos2023, title = {Empowering Knowledge Discovery from Scientific Literature: A novel approach to Research Artifact Analysis}, author = {Stavropoulos, Petros and Lyris, Ioannis and Manola, Natalia and Grypari, Ioanna and Papageorgiou, Harris}, @@ -2403,6 +2650,7 @@ @inproceedings{stavropoulos2023 url = {https://aclanthology.org/2023.nlposs-1.5/} } + @article{stroebe2019, title = {What Can We Learn from Many Labs Replications?}, author = {Stroebe, Wolfgang}, @@ -2418,6 +2666,7 @@ @article{stroebe2019 langid = {en} } + @book{suber2012, title = {Open access}, author = {Suber, Peter}, @@ -2427,6 +2676,7 @@ @book{suber2012 url = {https://library.oapen.org/handle/20.500.12657/26065} } + @article{sugimoto2011, title = {Academic genealogy as an indicator of interdisciplinarity: An examination of dissertation networks in Library and Information Science}, author = {Sugimoto, Cassidy R. and Ni, Chaoqun and Russell, Terrell G. and Bychowski, Brenna}, @@ -2440,6 +2690,7 @@ @article{sugimoto2011 url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/asi.21568} } + @techreport{sveinsdottir2021, title = {An Analysis of Open Science Policies in Europe, v7}, author = {Sveinsdottir, Thordis and Davidson, Joy and Proudman, Vanessa}, @@ -2450,6 +2701,7 @@ @techreport{sveinsdottir2021 note = {DOI: 10.5281/zenodo.4725817} } + @article{szomszor2022, title = {Overton: A bibliometric database of policy document citations}, author = {Szomszor, Martin and Adie, Euan}, @@ -2464,6 +2716,8 @@ @article{szomszor2022 url = {https://doi.org/10.1162/qss_a_00204} } + + @book{tashakkori2021, title = {Foundations of mixed methods research: integrating quantitative and qualitative approaches in the social and behavioral sciences}, author = {Tashakkori, Abbas and Johnson, R. 
Burke and Teddlie, Charles}, @@ -2474,6 +2728,7 @@ @book{tashakkori2021 address = {Los Angeles London New Delhi Singapore Washington DC Melbourne} } + @article{tattersall2018, title = {What Can Altmetric.com Tell Us About Policy Citations of Research? An Analysis of Altmetric.com Data for Research Articles from the University of Sheffield}, author = {Tattersall, Andy and Carroll, Christopher}, @@ -2488,6 +2743,8 @@ @article{tattersall2018 langid = {English} } + + @article{tennant2016, title = {The academic, economic and societal impacts of Open Access: an evidence-based review}, author = {Tennant, Jonathan P. and Waldner, {François} and Jacques, Damien C. and Masuzzo, Paola and Collister, Lauren B. and Hartgerink, Chris H. J.}, @@ -2498,6 +2755,7 @@ @article{tennant2016 url = {https://f1000research.com/articles/5-632} } + @article{tiokhin_shifting_2023, title = {Shifting the {Level} of {Selection} in {Science}}, issn = {1745-6916}, @@ -2526,6 +2784,7 @@ @article{tomkins_reviewer_2017 pages = {12708--12713} } + @misc{traag_causal_2022, title = {Causal foundations of bias, disparity and fairness}, url = {http://arxiv.org/abs/2207.13665}, @@ -2561,6 +2820,13 @@ @article{traag2021 langid = {en} } +@techreport{tripp2011, + title = {The Economic Impacts of Human Genome Project}, + author = {Tripp, Simon and Grueber, Martin}, + year = {2011}, + date = {2011} +} + @article{valladares2021, title = {Scientific Literacy and Social Transformation}, author = {Valladares, Liliana}, @@ -2586,6 +2852,18 @@ @article{van_den_besselaar_perverse_2017 pages = {905--918} } +@article{vanvlijmen2020, + title = {The Need of Industry to Go FAIR}, + author = {van Vlijmen, Herman and Mons, Albert and Waalkens, Arne and Franke, Wouter and Baak, Arie and Ruiter, Gerbrand and Kirkpatrick, Christine and da Silva Santos, Luiz Olavo Bonino and Meerman, Bert and Jellema, Renger and Arts, Derk and Kersloot, Martijn and Knijnenburg, Sebastiaan and Lusher, Scott and Verbeeck, Rudi and Neefs, 
Jean-Marc}, + year = {2020}, + date = {2020}, + journal = {Data Intelligence}, + pages = {276--284}, + volume = {2}, + number = {1-2}, + doi = {10.1162/dint_a_00050} +} + @book{venturini2021, title = {Controversy Mapping: A Field Guide}, author = {Venturini, Tommaso and Munk, Anders Kristian}, @@ -2683,7 +2961,6 @@ @article{waltman2019 doi = {10.1007/978-3-030-02511-3_11} } - @article{wang2020, title = {Consistency and validity of interdisciplinarity measures}, author = {Wang, Qi and Schneider, Jesper Wiborg}, @@ -2697,7 +2974,6 @@ @article{wang2020 note = {Publisher: MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info {\ldots}} } - @article{way2019, title = {Productivity, prominence, and the effects of academic environment}, author = {Way, Samuel F. and Morgan, Allison C. and Larremore, Daniel B. and Clauset, Aaron}, @@ -2709,6 +2985,17 @@ @article{way2019 doi = {10.1073/pnas.1817431116} } +@article{wehn2021, + title = {Impact assessment of citizen science: state of the art and guiding principles for a consolidated approach}, + author = {Wehn, Uta and Gharesifard, Mohammad and Ceccaroni, Luigi and Joyce, Hannah and Ajates, Raquel and Woods, Sasha and Bilbao, Ane and Parkinson, Stephen and Gold, Margaret and Wheatland, Jonathan}, + year = {2021}, + date = {2021}, + journal = {Sustainability Science}, + pages = {1683--1699}, + volume = {16}, + number = {5}, + doi = {10.1007/s11625-021-00959-2} +} @article{westreich2013, title = {The Table 2 Fallacy: Presenting and Interpreting Confounder and Modifier Coefficients}, @@ -2724,14 +3011,12 @@ @article{westreich2013 url = {https://doi.org/10.1093/aje/kws412} } - @misc{whatper, title = {What personal data is considered sensitive? 
- European Commission}, url = {https://commission.europa.eu/law/law-topic/data-protection/reform/rules-business-and-organisations/legal-grounds-processing-data/sensitive-data/what-personal-data-considered-sensitive_en}, langid = {en} } - @article{wilkinson2016, title = {The FAIR Guiding Principles for scientific data management and stewardship}, author = {Wilkinson, Mark D. and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E. and Bouwman, Jildau and Brookes, Anthony J. and Clark, Tim and Crosas, {Mercè} and Dillo, Ingrid and Dumon, Olivier and Edmunds, Scott and Evelo, Chris T. and Finkers, Richard and Gonzalez-Beltran, Alejandra and Gray, Alasdair J. G. and Groth, Paul and Goble, Carole and Grethe, Jeffrey S. and Heringa, Jaap and {{\textquoteright}t Hoen}, Peter A. C. and Hooft, Rob and Kuhn, Tobias and Kok, Ruben and Kok, Joost and Lusher, Scott J. and Martone, Maryann E. and Mons, Albert and Packer, Abel L. and Persson, Bengt and Rocca-Serra, Philippe and Roos, Marco and van Schaik, Rene and Sansone, Susanna-Assunta and Schultes, Erik and Sengstag, Thierry and Slater, Ted and Strawn, George and Swertz, Morris A. and Thompson, Mark and van der Lei, Johan and van Mulligen, Erik and Velterop, Jan and Waagmeester, Andra and Wittenburg, Peter and Wolstencroft, Katherine and Zhao, Jun and Mons, Barend}, @@ -2748,7 +3033,6 @@ @article{wilkinson2016 langid = {en} } - @article{wilkinson2016a, title = {The FAIR Guiding Principles for scientific data management and stewardship}, author = {Wilkinson, Mark D. and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E. and Bouwman, Jildau and Brookes, Anthony J. 
and Clark, Tim and Crosas, {Mercè} and Dillo, Ingrid and Dumon, Olivier and Edmunds, Scott and Evelo, Chris T. and Finkers, Richard and Gonzalez-Beltran, Alejandra and Gray, Alasdair J. G. and Groth, Paul and Goble, Carole and Grethe, Jeffrey S. and Heringa, Jaap and {{\textquoteright}t Hoen}, Peter A. C. and Hooft, Rob and Kuhn, Tobias and Kok, Ruben and Kok, Joost and Lusher, Scott J. and Martone, Maryann E. and Mons, Albert and Packer, Abel L. and Persson, Bengt and Rocca-Serra, Philippe and Roos, Marco and van Schaik, Rene and Sansone, Susanna-Assunta and Schultes, Erik and Sengstag, Thierry and Slater, Ted and Strawn, George and Swertz, Morris A. and Thompson, Mark and van der Lei, Johan and van Mulligen, Erik and Velterop, Jan and Waagmeester, Andra and Wittenburg, Peter and Wolstencroft, Katherine and Zhao, Jun and Mons, Barend}, @@ -2765,7 +3049,6 @@ @article{wilkinson2016a langid = {en} } - @article{wilner, title = {Complete recovery of values in Diophantine systems (CORVIDS)}, author = {Wilner, Sean and Wood, Katherine and Simons, Daniel J.}, @@ -2773,8 +3056,6 @@ @article{wilner langid = {en-us} } - - @techreport{wilsdon_metric_2015, title = {Metric {Tide}: {Report} of the {Independent} {Review} of the {Role} of {Metrics} in {Research} {Assessment} and {Management}}, institution = {Higher Education Funding Council for England}, @@ -2784,7 +3065,6 @@ @techreport{wilsdon_metric_2015 pages = {163} } - @book{wood2021, title = {CORVIDS}, author = {Wood, Katherine}, @@ -2795,8 +3075,6 @@ @book{wood2021 note = {original-date: 2018-01-29T16:15:29Z} } - - @article{woods2022, title = {Incentivising research data sharing: a scoping review}, author = {Woods, Helen Buckley and Pinfield, Stephen}, @@ -2810,6 +3088,16 @@ @article{woods2022 url = {https://wellcomeopenresearch.org/articles/6-355/v2} } +@article{woodward2003, + title = {Critical Notice: Causality by Judea Pearl}, + author = {Woodward, James}, + year = {2003}, + journal = {Economics & Philosophy}, + pages 
= {321--340}, + volume = {19}, + number = {2}, + doi = {10.1017/S0266267103001184} +} @article{wuchty2007, title = {The Increasing Dominance of Teams in Production of Knowledge}, @@ -2836,6 +3124,21 @@ @article{yarkoni2019 url = {https://psyarxiv.com/jqw35/} } +@article{yozwiak_data_2015, + title = {Data sharing: {Make} outbreak research open access}, + volume = {518}, + issn = {1476-4687}, + shorttitle = {Data sharing}, + doi = {10.1038/518477a}, + language = {en}, + number = {7540}, + urldate = {2024-12-19}, + journal = {Nature}, + author = {Yozwiak, Nathan L. and Schaffner, Stephen F. and Sabeti, Pardis C.}, + month = feb, + year = {2015}, + pages = {477--479} +} @article{zahedi2017, title = {Mendeley readership as a filtering tool to identify highly cited publications}, @@ -2864,3 +3167,14 @@ @article{zahedi2020 url = {https://ll-j-sar.ubiquityjournal.website/articles/10.29024/sar.20}, langid = {canadian} } + +@inproceedings{zeleti2014, + title = {Capability Matrix for Open Data}, + author = {Zeleti, Fatemeh Ahmadi and Ojo, Adegboyega}, + editor = {Camarinha-Matos, Luis M. 
and Afsarmanesh, Hamideh}, + year = {2014}, + publisher = {Springer}, + pages = {498--509}, + doi = {10.1007/978-3-662-44745-1_50}, + address = {Berlin, Heidelberg} +} diff --git a/sections/0_causality/figures/DAG_open_data_cost_savings-0.png b/sections/0_causality/figures/DAG_open_data_cost_savings-0.png new file mode 100644 index 0000000..98227bd Binary files /dev/null and b/sections/0_causality/figures/DAG_open_data_cost_savings-0.png differ diff --git a/sections/0_causality/figures/DAG_open_data_cost_savings-1.png b/sections/0_causality/figures/DAG_open_data_cost_savings-1.png new file mode 100644 index 0000000..b7a6c5b Binary files /dev/null and b/sections/0_causality/figures/DAG_open_data_cost_savings-1.png differ diff --git a/sections/0_causality/figures/DAG_open_data_cost_savings-2.png b/sections/0_causality/figures/DAG_open_data_cost_savings-2.png new file mode 100644 index 0000000..7ff3c1a Binary files /dev/null and b/sections/0_causality/figures/DAG_open_data_cost_savings-2.png differ diff --git a/sections/0_causality/figures/DAG_open_data_cost_savings-3.png b/sections/0_causality/figures/DAG_open_data_cost_savings-3.png new file mode 100644 index 0000000..f90d48c Binary files /dev/null and b/sections/0_causality/figures/DAG_open_data_cost_savings-3.png differ diff --git a/sections/0_causality/figures/DAG_open_data_cost_savings.tex b/sections/0_causality/figures/DAG_open_data_cost_savings.tex new file mode 100644 index 0000000..e8b1052 --- /dev/null +++ b/sections/0_causality/figures/DAG_open_data_cost_savings.tex @@ -0,0 +1,135 @@ +\documentclass[tikz]{standalone} + +\input{figure.tex} +\input{styling.tex} + +\usetikzlibrary{calc} + +\begin{document} + +\tikzset{every node/.style={n square}} +\tikzset{every edge/.style={e}} +\tikzstyle{grayed}=[fill=gray!10] +\tikzset{x=1cm,y=1.5cm} + +% FIRST DAG +\begin{tikzpicture} + +% Nodes +\node (open data) at (-3, -1.5) {Open Data}; +\node (cost saving) at (-3,-4) {Cost Saving}; +\node (time saving) at (2, -1.5) 
{Time Saving}; +\node (user skills) at (2,-3) {User Skills}; +\node (innovation) at (-8, -3) {Innovation}; +\node (collaboration) at (-8, -1.5) {Collaboration}; +\node (infrastructure) at (2, 0) {Technological Infrastructure}; +\node (quality) at (-5, 0) {Data Availability, Quality and Standardisation}; + +% Edges +\draw (quality) edge (open data) + (infrastructure) edge (open data) + (collaboration) edge (open data) + (open data) edge (cost saving) + (open data) edge (time saving) + (collaboration) edge (innovation) + (open data) edge (innovation) + (collaboration) edge (cost saving) + (time saving) edge (cost saving) + (user skills) edge (open data) + (user skills) edge (time saving) + (infrastructure) edge (time saving) + (quality) edge (time saving); + +\end{tikzpicture} + +% SECOND DAG: the mediator Time Saving is conditioned on (closed, thick outline), which blocks the indirect path; all other covariates are grayed out +\begin{tikzpicture} + +% Nodes +\node (open data) at (-3, -1.5) {Open Data}; +\node (cost saving) at (-3,-4) {Cost Saving}; +\node[closed,conditioned] (time saving) at (2, -1.5) {Time Saving}; +\node[grayed] (user skills) at (2,-3) {User Skills}; +\node[grayed] (innovation) at (-8, -3) {Innovation}; +\node[grayed] (collaboration) at (-8, -1.5) {Collaboration}; +\node[grayed] (infrastructure) at (2, 0) {Technological Infrastructure}; +\node[grayed] (quality) at (-5, 0) {Data Availability, Quality and Standardisation}; + +% Edges +\draw (quality) edge[e-gray] (open data) + (infrastructure) edge[e-gray] (open data) + (collaboration) edge[e-gray] (open data) + (open data) edge (cost saving) + (open data) edge[e-gray] (time saving) + (collaboration) edge[e-gray] (innovation) + (open data) edge[e-gray] (innovation) + (collaboration) edge[e-gray] (cost saving) + (time saving) edge[e-gray] (cost saving) + (user skills) edge[e-gray] (open data) + (user skills) edge[e-gray] (time saving) + (infrastructure) edge[e-gray] (time saving) + (quality) edge[e-gray] (time saving); + +\end{tikzpicture} + +% THIRD DAG: the collider Innovation is conditioned on (open) while the confounder Collaboration is left unadjusted (open) +\begin{tikzpicture} + +% Nodes +\node (open data) at (-3, -1.5) {Open Data}; +\node (cost
saving) at (-3,-4) {Cost Saving}; +\node[grayed] (time saving) at (2, -1.5) {Time Saving}; +\node[grayed] (user skills) at (2,-3) {User Skills}; +\node[open,conditioned] (innovation) at (-8, -3) {Innovation}; +\node[open] (collaboration) at (-8, -1.5) {Collaboration}; +\node[grayed] (infrastructure) at (2, 0) {Technological Infrastructure}; +\node[grayed] (quality) at (-5, 0) {Data Availability, Quality and Standardisation}; + +% Edges +\draw (quality) edge[e-gray] (open data) + (infrastructure) edge[e-gray] (open data) + (collaboration) edge (open data) + (open data) edge (cost saving) + (open data) edge[e-gray] (time saving) + (collaboration) edge (innovation) + (open data) edge (innovation) + (collaboration) edge (cost saving) + (time saving) edge[e-gray] (cost saving) + (user skills) edge[e-gray] (open data) + (user skills) edge[e-gray] (time saving) + (infrastructure) edge[e-gray] (time saving) + (quality) edge[e-gray] (time saving); + +\end{tikzpicture} + +% FOURTH DAG: adjustment set -- the four confounders are conditioned on (closed), the mediator Time Saving stays open, the collider Innovation is closed without conditioning +\begin{tikzpicture} + +% Nodes +\node (open data) at (-3, -1.5) {Open Data}; +\node (cost saving) at (-3,-4) {Cost Saving}; +\node[open] (time saving) at (2, -1.5) {Time Saving}; +\node[closed,conditioned] (user skills) at (2,-3) {User Skills}; +\node[closed] (innovation) at (-8, -3) {Innovation}; +\node[closed,conditioned] (collaboration) at (-8, -1.5) {Collaboration}; +\node[closed,conditioned] (infrastructure) at (2, 0) {Technological Infrastructure}; +\node[closed,conditioned] (quality) at (-5, 0) {Data Availability, Quality and Standardisation}; + +% Edges +\draw (quality) edge[e-gray] (open data) + (infrastructure) edge[e-gray] (open data) + (collaboration) edge[e-gray] (open data) + (open data) edge (cost saving) + (open data) edge (time saving) + (collaboration) edge[e-gray] (innovation) + (open data) edge[e-gray] (innovation) + (collaboration) edge[e-gray] (cost saving) + (time saving) edge (cost saving) + (user skills) edge[e-gray] (open data) + (user skills) edge[e-gray] (time
saving) + (infrastructure) edge[e-gray] (time saving) + (quality) edge[e-gray] (time saving); + +\end{tikzpicture} + +\end{document} diff --git a/sections/0_causality/figures/ODCA_model.tex b/sections/0_causality/figures/ODCA_model.tex index ced6407..12e57bb 100644 --- a/sections/0_causality/figures/ODCA_model.tex +++ b/sections/0_causality/figures/ODCA_model.tex @@ -8,7 +8,7 @@ \begin{document} \begin{tikzpicture}[x=100,y=100, every node/.style={n square}, - every edge/.style={e}] + every edge/.style={e-gray}] \node (OD policy) at (-2,0.5) {OD policy}; \node (open data) at (-1.1,0.5) {open data}; diff --git a/sections/0_causality/figures/styling.tex b/sections/0_causality/figures/styling.tex index b3f2bf5..a7562ec 100644 --- a/sections/0_causality/figures/styling.tex +++ b/sections/0_causality/figures/styling.tex @@ -9,7 +9,8 @@ \definecolor{Set1-A}{RGB}{228, 26, 28} \definecolor{Set1-B}{RGB}{ 55, 126, 184} -\tikzstyle{e}=[rounded corners,very thick,draw=white,double=gray,arrows={-latex[gray]}] +\tikzstyle{e}=[rounded corners,very thick,draw=white,double=black,arrows={-latex[black]}] +\tikzstyle{e-gray}=[e,draw=white,double=lightgray,arrows={-latex[lightgray]}] \tikzset{every label/.style={rectangle,fill=none,draw=none, label distance=4pt}} \tikzstyle{n}=[circle,fill=white,draw=white,line width=2pt,outer sep=0pt,inner sep=0pt, minimum size=4ex] diff --git a/sections/0_causality/open_data_cost_savings.qmd b/sections/0_causality/open_data_cost_savings.qmd new file mode 100644 index 0000000..b69bd45 --- /dev/null +++ b/sections/0_causality/open_data_cost_savings.qmd @@ -0,0 +1,99 @@ +--- +author: + - name: Marla Scorrano + affiliations: + - ref: csil + - name: E. 
Delugas + orcid: 0009-0000-4638-1062 + affiliations: + - ref: csil + +affiliations: +- id: csil + name: Centre for Industrial Studies + city: Milan + country: Italy +--- + +# The effect of Open Data on cost savings {#the-effect-of-open-data-on-cost-savings .unnumbered} + +History (please fill out in reverse chronological order, latest revision on top): + +| | | | | +|------------------|------------------|------------------|------------------| +| Version | Revision date | Revision | Author | +| | | | | +| 1.3 | 2024-12-09 | Second draft | M. Scorrano, E. Delugas, G. Catalano (reviewer) | +| 1.2 | 2024-11-27 | Peer review | V.A. Traag | +| 1.1 | 2024-11-22 | First draft | M. Scorrano, E. Delugas, S. Vignetti (reviewer) | +| 1.0 | 2024-10-07 | Template outline | E. Delugas | + +## Literature background + +Measuring the economic impact of open science and open data has proven to be challenging. Many theoretical studies highlight the benefits of making research results public, with strong support for Open Science from economic research on technological change [@chataway2018,@yozwiak_data_2015,@mazzucato_entrepreneurial_2011]. However, only few studies have attempted to measure the impacts of open science compared to closed science, and more robust evidence on how Open Science drives innovation and economic outcomes is needed to strengthen support and counter emerging criticisms [@karasz2024,@ali-khan2018]. The existing literature mainly concentrates on specific sectors, particularly health, medicine, and biosciences, which receive more attention due to early regulation by funders and significant interest in clinical trial outcomes. 
Another important stream of literature is focused on highlighting the economic value of Open Science through personal industry experiences, though lacking precise quantitative evidence, with contributions from @mcmanamay_openaccess_2014 on fisheries, @harding2017 on medicine, @chan_cost_2015 on the transition to an Open Science model, and @chen2017 on the role of open data in AI and machine learning applications. Although directly linking economic outcomes to open data initiatives can be challenging, with authors combining theoretical arguments and the limited quantitative evidence available at the time of their publication [@ali2022,@tennant2016,@fell2019,@wehn2021,@arzberger_promoting_2004], open access to findings and data is considered to lead to significant savings in access costs. By removing paywalls and subscription fees, open data allows researchers and businesses to access valuable information without incurring additional costs. A major economic benefit of lowering the cost of knowledge is the availability of extra budget that can be reallocated to other purposes [@tennant2016]. + +By eliminating barriers to data access, organisations can reduce the time spent on data collection and focus on core activities. @fell2019 notes that open science reduces the time associated with accessing new knowledge, directly contributing to enhanced research quality and productivity increases. Although the specific time savings associated with open access were not directly tested, @parsons_benefits_2011 conducted an interview-based study that supports this potential benefit. @beagrie2014 demonstrate that data sharing and curation significantly enhance research efficiency, with labour cost savings ranging from two to over twenty times the operational costs of the data centres.
While there are indirect costs associated with preparing data for sharing, such as time and expenses incurred by depositors, the benefits realised by users through improved efficiency and their willingness to pay for access far outweigh these costs [@beagrie2012]. Additionally, findings from @beagrie2016 and @beagrie2021 provide compelling evidence of the time savings and efficiency gains associated with using European Bioinformatics Institute (EMBL-EBI) services. + +Open data minimises redundant data collection and mitigates the "file drawer effect," where valuable findings remain unpublished and inaccessible, ultimately impeding research effectiveness [@assen2014]. According to @sparceurope2019, making research data openly available can save up to 9% of a project's costs by preventing unnecessary data collection and facilitating the efficient reuse of existing data. Additionally, @houghton2011 estimate that access barriers to academic research in Denmark cost DKK 540 million annually. This figure is based on the average time spent (51--63 minutes) attempting to access research articles. The study also highlights that delays in accessing academic research can prolong product and process development by an average of 2.2 years, resulting in significant financial losses for firms. + +The broader implications of open data extend beyond mere cost and time savings. Scientific literature is widely recognised as an important source of strategic knowledge, facilitating the exploration of new ideas in industrial research and innovation, particularly for small and medium enterprises that may struggle to obtain data independently [@EU2020]. However, inefficiencies in traditional publishing models, such as delays and biases in data dissemination, can negatively impact private research productivity, as discussed by @harding2017.
Open data benefits various sectors, including agriculture, the environment, forensics, and industrial biotechnology, by providing access to information that helps researchers understand their fields and build on existing work [@fell2019,@tennant2016,@tripp2011,@arshad2016,@yozwiak_data_2015]. + +To fully harness the potential of open data, it is important to develop the necessary skills and capacities to manage it effectively. @zeleti2014 emphasise the need to streamline data generation processes to produce insights that inform and shape business strategies. Given that open data initiatives contribute to greater transparency and accountability, businesses that leverage open data achieve cost savings by co-creating and integrating data from multiple sources to enhance their services [@davies2016,@lindman2014]. @fell2019 suggests that adopting an open approach fosters connections and encourages collaborations that might not occur or would take longer in a closed environment. This is exemplified by the work of @vanvlijmen2020, who show how the integration of diverse types of open data, specifically scientific, clinical, and experimental public evidence, can be achieved using advanced AI platforms like Euretos. By combining these various data sources, the platform enhances the depth of information available for analysis, enabling more accurate predictions regarding drug efficacy, as demonstrated by a machine learning model that improved prediction accuracy by 12 percentage points over the previous state of the art. + +Despite the theoretical benefits of open data, several limitations hinder a comprehensive assessment of its economic impact. Implementing open data practices requires significant investments in infrastructure, technology, and training, potentially offsetting some cost savings [@vanvlijmen2020].
Moreover, the European Commission study emphasises that the benefits of open data are contingent on the quality and standardisation of the data provided [@EC-DGRI2018]. Finally, a major limitation is the scarcity of empirical evidence; few studies have attempted to measure the impacts of open science compared to closed science, making it challenging to generalise findings [@karasz2024]. @herala2016 review the benefits and challenges of open data initiatives in the private sector, highlighting advantages like enhanced collaboration and innovation, but caution that these are often based on speculative assumptions rather than empirical evidence, emphasising the need for further research to inform best practices and mitigate risks associated with increased costs and data privacy concerns. + +## Directed Acyclic Graph (DAG) + +As discussed in the general [introduction on causal inference](causal_intro/article/intro-causality.qmd), we use DAGs to represent structural causal models. In the following, a DAG ([@fig-model]) is employed to examine the causal relationship between *Open Data* and *Cost Savings*. The visual illustrates multiple potential pathways, including a direct path from Open Data to Cost Savings, an indirect one involving Time Savings (i.e., a mediator), and additional paths that incorporate factors affecting either Open Data or Time Savings (i.e., confounders). These additional factors, such as technological infrastructure, data quality and availability, standardisation, user skills, innovation, and collaboration, introduce layers of complexity to the model. As we will show in the subsequent sections, they are essential for discussing the causal and non-causal, open and closed, relationships among all these variables.
+ +![Hypothetical structural causal model on Open Data](figures/DAG_open_data_cost_savings-0.png){#fig-model} + +## The effect of Open data on Cost Saving + +In this section, we apply the concepts presented in the section Causality in Science Studies to potential research questions. We present a specific perspective on causal inference through the lens of structural causal models [@pearl_causality_2009]. + +Suppose we are interested in assessing the *total causal effect* of *Open Data* on *Cost Saving*. According to our model ([@fig-model]), there are multiple pathways from Open Data to Cost Saving, some are causal, some are not. To estimate the causal effect of interest, we need to make sure that all causal paths are open, and all non-causal paths are closed. Within the DAG representation, two causal pathways can be identified: a *direct* pathway of *Open Data* $\rightarrow$ *Cost Saving*, representing the direct effect of Open Data on Cost Savings, and an *indirect* pathway *Open Data* $\rightarrow$ *Time Saving* $\rightarrow$ *Cost Saving*, where the effect is indirect and mediated by *Time Saving*. The *direct* effect captures the immediate benefits of providing free access to datasets, while the *indirect* effect, mediated by Time Savings, strengthens the relationship by triggering additional efficiencies that also lead to Cost Savings. + +To properly estimate the *total* causal effect of Open Data on Cost Saving, an empirical model should not control for Time saving. On the contrary, if the model conditions on Time Savings, even implicitly (e.g., by accounting for approaches and tools that optimise and speed up data access and processing), it closes the causal path and introduces biases into the estimation of the total effect ([@fig-mediator]). + +![DAG illustrating the misleading effect of conditioning on the mediator variable, Time Saving. Nodes that are controlled for have a thick outline. Grey nodes represent variables not considered in this figure. 
Green nodes are open, indicating they allow associations or relationships to flow through them along the paths they connect, while orange nodes are closed, blocking associations or relationships from flowing through the paths they connect. Black arrows represent potential causal influence, whereas grey arrows indicate an indirect association that may involve a non-causal relationship.](figures/DAG_open_data_cost_savings-1.png){#fig-mediator} + +As mentioned before, the proposed model accounts for additional variables such as *Data Availability, Quality and Standardisation*, *User Skills*, *Collaboration*, and *Technological Infrastructure*. These variables can act as *confounders* along different pathways illustrated in [@fig-model]. Examples of non-causal paths represented in the DAG are: + +- *Open Data* $\leftarrow$ *Technological Infrastructure* $\rightarrow$ *Time Saving* $\rightarrow$ *Cost Saving* +- *Open Data* $\leftarrow$ *Data Availability, Quality and Standardisation* $\rightarrow$ *Time Saving* $\rightarrow$ *Cost Saving* +- *Open Data* $\leftarrow$ *User Skills* $\rightarrow$ *Time Saving* $\rightarrow$ *Cost Saving* + +In these pathways, to correctly identify the causal effect of *Open Data* on *Time Saving*, and by extension, on *Cost Savings*, we need to control for these confounders. This is because the confounders jointly affect Open Data and Time Savings, and omitting them from empirical models results in omitted variable bias. In the proposed example, to correctly identify the causal effect one should control for *Technological Infrastructure*, *Data Availability, Quality and Standardisation*, and *User Skills*. + +There might be instances where a confounder is not observed because it is not included in the dataset or is not observable at all. In such cases, the non-causal path remains open, resulting in biased conclusions. In fact, unobservable factors might be correlated with observable variables, leaving causal paths unexplored.
As a result, if these confounders are not accounted for, we are unable to fully isolate the causal effect of *Open Data* on *Cost Saving*. + +Another case that makes identification of the causal effect impossible is erroneously controlling for a collider (see the introduction for further details). In the pathway *Open Data* $\rightarrow$ *Innovation* $\leftarrow$ *Collaboration* $\rightarrow$ *Cost Saving*, the variable *Innovation* acts as a collider. Hence, this path is already closed, and the bias arises when controlling for innovation. + +Empirically speaking, if we condition on Innovation (and not on Collaboration) in a model that estimates the causal effect of Open Data on Cost Savings, we are likely to obtain a downward-biased estimate of the causal effect, since both Collaboration and Open Data have a positive impact on Innovation. This conditioning opens up the non-causal pathway, *Open Data* $\rightarrow$ *Innovation* $\leftarrow$ *Collaboration* $\rightarrow$ *Cost Saving*, which connects *Open Data* and *Cost Saving* through *Collaboration*, creating a spurious association and distorting the true effect of *Open Data* on *Cost Saving*. This is an example of *bad controls* [@angrist2009], a concept explained in the general introduction. Only by ignoring the collider, meaning not conditioning on it in empirical models, can we effectively isolate the causal effect. This non-causal path is open because *Innovation* is open (because it is a collider that is conditioned on), and because *Collaboration* is open (because it is a confounder that is not conditioned on) (see @fig-collider). + +![DAG illustrating the misleading effect of conditioning on a collider variable, *Innovation*, and not conditioning on a confounder, *Collaboration*. Nodes that are controlled for have a thick outline. Grey nodes represent variables not considered in this figure.
Green nodes are open, indicating they allow associations or relationships to flow through them along the paths they connect, while orange nodes are closed, blocking associations or relationships from flowing through the paths they connect. Black arrows represent potential causal influence, whereas grey arrows indicate an indirect association that may involve a non-causal relationship.](figures/DAG_open_data_cost_savings-2.png){#fig-collider} + +In addition, *Collaboration* acts as a confounder on the non-causal path *Open Data* $\leftarrow$ *Collaboration* $\rightarrow$ *Cost Saving*. To identify the causal effect, we hence need to close this non-causal path by conditioning on *Collaboration*. After controlling for Collaboration, whether *Innovation* is conditioned on is then irrelevant for the identification of the causal effect. When all non-causal paths are closed, the research design is said to meet the *backdoor* criterion, a formal requirement that ensures the design blocks all non-causal paths between the treatment (*Open Data*) and the outcome (*Cost Saving*), enabling us to identify the causal effect in question [@cunningham_causal_2021] ([@fig-adjustment]). + +![DAG illustrating the total effect of *Open Data* on *Cost Saving*, conditioning on the confounders and not on the mediator or the collider. Nodes that are controlled for have a thick outline. Green nodes are open, indicating they allow associations or relationships to flow through them along the paths they connect, while orange nodes are closed, blocking associations or relationships from flowing through the paths they connect.
Black arrows represent potential causal influence, whereas grey arrows indicate an indirect association that may involve a non-causal relationship.](figures/DAG_open_data_cost_savings-3.png){#fig-adjustment} + +This example highlights key components of causal inference: controlling for confounders (*Data availability, quality and standardisation*, *User skills*, *Collaboration*, and *Technological Infrastructure*), not controlling for mediators (*Time saving*), and not controlling for colliders (*Innovation*), as shown in @fig-adjustment. Constructing an appropriate DAG is important when aiming to draw causal conclusions. Without making assumptions explicit via a DAG, it would be unclear which variables should be controlled for and which should not. Omitting important variables weakens the study's ability to draw accurate conclusions about cause and effect. Moreover, adding complexity to a DAG does not always change the variables that need to be controlled for when identifying the causal effect. In some cases, such as when adding confounders between unrelated variables, the identification of the relationship between Open data and Cost savings remains unaffected. However, if a confounder is introduced between Open data and Cost savings directly, it becomes necessary to control for it. + +In the [causality introduction](causal_intro/article/intro-causality.qmd) we emphasise the importance of carefully selecting variables when analysing causal relationships. The introduction warns against two common errors: relying on the data (e.g., through stepwise regression) to decide which variables to control for, or including all available variables, which @mcelreath2020 refers to as "causal salad". Both approaches can lead to incorrect conclusions. Specifically, including mediating variables or focusing only on certain cases could obscure the true effect of open data on outcomes like cost savings.
In this regard, @pearl_causality_2009 proposes using the *do-operator*[^1] to define causal effects, where an intervention on one variable allows us to observe changes in another, thereby illuminating causal connections within a system. + +[^1]: The *do-operator* is a notation used in causal inference to denote an intervention in a system. Written as $do(X = x)$, it represents setting variable $X$ to a specific value $x$, simulating the effect of this intervention on other variables in the system while breaking any causal connections that usually determine $X$. This approach allows us to differentiate causation from correlation, estimate causal effects, and answer hypothetical "what-if" scenarios. Through *do-calculus*, a set of rules introduced by Pearl, interventional distributions involving the *do-operator* can be converted into observational distributions. + +This approach is effective in predicting how probability distributions shift under controlled changes when the causal structure is known. However, this approach depends on having an established causal graph, limiting its use for exploring causality from scratch. Critics suggest an alternative approach that allows causal discovery through experimentation without prior assumptions about mechanisms [@woodward2003]. This is especially useful in complex fields, such as social and biomedical sciences, where causal relationships are less understood. Considering this, @cunningham_causal_2021 argues that sample selection problems were recognised long before the introduction of DAGs, with early solutions like @heckman1979, and emphasises that an atheoretical approach to empiricism is inadequate. He asserts that causal inference requires a deep understanding of the behavioural processes behind the phenomenon being studied, and while DAGs are useful, they cannot replace the need for theoretical knowledge in creating credible identification strategies.
Thus, causal inference is not solved by simply collecting more data, but by integrating theory with empirical analysis. + +## Discussing empirical issues + +The presented model illustrates how open data might drive cost savings by focusing on a limited set of variables commonly discussed in the literature. However, this approach presents challenges, as the existing literature reveals a significant gap in empirical studies that specifically measure the economic impact of open data on cost savings. + +Other relevant factors could be included based on additional evidence but are currently excluded to maintain a coherent link with existing empirical findings and to preserve simplicity. While there is general agreement on the potential benefits of open data for cost savings, no study to date has estimated the total effect using a causal identification strategy such as the one presented here. Moreover, the specific causal pathways remain unclear, as little attention has been paid to the intermediate factors influencing the total effect. This lack of empirical evidence on these pathways limits the ability to draw strong conclusions from the model and underscores the need for more focused empirical research. + +It is also important to acknowledge that causal pathways are not static and may evolve depending on the context or field of application. @cunningham_causal_2021 highlights that causal inference requires a deeper understanding of the underlying processes governing the system being studied. It is not solely about the data but also about the theoretical and contextual knowledge of how behaviours, choices, or events interact to produce stable outcomes. Timing and the evolution of causality play a critical role, as interactions between variables change over time, leading to different outcomes in different contexts. This complexity, arising from reverse causality and cyclical interactions, represents a key limitation of this DAG. 
The relationships between variables are often more intricate, with the potential for bidirectional influences. For example, when organisations share open data, they enable collaborative efforts that can lead to innovative solutions, such as new products or services. Increased collaboration, in turn, may prompt organisations to adopt more open data practices, creating a bidirectional relationship. This innovation can then lead to cost savings by streamlining processes or reducing redundancies. Conversely, independent cost savings may provide the resources necessary for further investment in innovation, illustrating a cyclical relationship where each factor reinforces the other. This dynamic interplay complicates the one-way causal pathways represented in the DAG. Feedback loops and time-dependent changes are often critical in real-world scenarios, making the DAG an initial framework rather than a definitive model. Further empirical research is essential to refine and validate the relationships it proposes, capturing the complexities and nuances of the interactions between these variables. + +A targeted survey could be a viable approach to address the empirical challenges arising from the lack of data and counterfactual evidence in studying the economic impact of open data. This survey should be carefully designed to measure the direct effects of open data on cost savings while accounting for confounding factors. It should aim to answer specific research questions, such as the extent of open data sharing, the perceived efficiency gains from data use, and the direct cost reductions attributed to open data initiatives. The survey would need to collect both longitudinal and cross-sectional data from a diverse sample of organisations that share open data as well as those that do not. This would allow for comparisons of outcomes and the establishment of causal relationships. 
Key variables to be measured should include the extent of open data sharing, cost reductions, new products or services enabled or accelerated by open data, and changes in collaboration. Questions investigating the perceived causal pathways would also be essential. Incorporating temporal data would provide further insight, enabling a better understanding of causal direction and distinguishing between short-term and long-term effects. + +Access to such detailed data would enable the use of causal inference techniques, such as Propensity Score Matching (PSM) or the Difference-in-Differences (DID) estimator, to identify the true impact of open data on cost savings. By addressing current data gaps, this survey could provide the empirical evidence needed to validate and refine the DAG, enhancing our understanding of how open data drives economic outcomes like cost savings. For instance, in the PSM context, a rich dataset would support the development of a more comprehensive DAG, helping to identify variables to include in the matching process (e.g., data availability, technological infrastructure, and user skills) and those to exclude (e.g., time savings and innovation). This approach would strengthen the empirical basis for analysing the causal impact of open data. \ No newline at end of file