diff --git a/config/tess.example.yml b/config/tess.example.yml index 3f497d934..1ba090e54 100644 --- a/config/tess.example.yml +++ b/config/tess.example.yml @@ -164,6 +164,7 @@ default: &default # biotools, topics, operations, sponsors, fairshare, county, ardc_fields_of_research, # other_types, subsets, syllabus, approved_editors, address_finder disabled: ['ardc_fields_of_research', 'other_types', 'subsets', 'syllabus', 'approved_editors'] + auto_parse_vars: [] # available features to auto parse from description: ['keywords', 'target_audience'] materials_disabled: [] content_providers_disabled: [] bioschemas_testing: false diff --git a/lib/ingestors/auto_parser_mappings/keywords.json b/lib/ingestors/auto_parser_mappings/keywords.json new file mode 100644 index 000000000..4b0d4bd83 --- /dev/null +++ b/lib/ingestors/auto_parser_mappings/keywords.json @@ -0,0 +1,140 @@ +{ +"Research data": "RDM, FAIR principles and open science", +"Scientific disciplines": "RDM, FAIR principles and open science", +"Research methodologies": "RDM, FAIR principles and open science", +"Research data lifecycle": "RDM, FAIR principles and open science", +"Roles and responsibilities in RDM": "RDM, FAIR principles and open science", +"Open science": "RDM, FAIR principles and open science", +"Reproducibility and replicability": "RDM, FAIR principles and open science", +"Scholarly communication": "RDM, FAIR principles and open science", +"Costs for RDM": "RDM, FAIR principles and open science", +"Data management planning": "RDM, FAIR principles and open science", +"FAIR data principles": "RDM, FAIR principles and open science", +"Data discovery": "RDM, FAIR principles and open science", +"Data reuse": "RDM, FAIR principles and open science", +"Data collection": "RDM, FAIR principles and open science", +"Data documentation": "RDM, FAIR principles and open science", +"Data organisation": "RDM, FAIR principles and open science", +"File naming (conventions)": "RDM, FAIR principles and open 
science", +"Data versioning": "RDM, FAIR principles and open science", +"Data formats and types": "RDM, FAIR principles and open science", +"Data back-up": "RDM, FAIR principles and open science", +"Data selection": "RDM, FAIR principles and open science", +"Data destruction": "RDM, FAIR principles and open science", +"Data preservation and archiving": "RDM, FAIR principles and open science", +"Data publication": "RDM, FAIR principles and open science", +"Data curation": "RDM, FAIR principles and open science", +"Data visualisation": "RDM, FAIR principles and open science", +"Data provenance": "RDM, FAIR principles and open science", +"Metadata (standard)": "RDM, FAIR principles and open science", +"Controlled vocabulary, ontology, taxonomy, thesaurus": "RDM, FAIR principles and open science", +"Linked Open Data and SPARQL": "RDM, FAIR principles and open science", +"FAIR metrics": "RDM, FAIR principles and open science", +"3-point FAIRification Framework (FAIR data point, FAIR Implementation Profile)": "RDM, FAIR principles and open science", +"Persistent identifier": "RDM, FAIR principles and open science", +"Open Archives Initiative Protocol for Metadata Harvesting (OAI-PMH)": "RDM, FAIR principles and open science", +"Spreadsheet tools": "RDM, FAIR principles and open science", +"Data modelling": "RDM, FAIR principles and open science", +"Data integration": "RDM, FAIR principles and open science", +"Data integrity, validation & quality": "RDM, FAIR principles and open science", +"Data cleaning & wrangling": "RDM, FAIR principles and open science", +"Database management": "RDM, FAIR principles and open science", +"Master data management": "RDM, FAIR principles and open science", +"Business intelligence": "RDM, FAIR principles and open science", +"RDM service model": "RDM, FAIR principles and open science", + +"Research software": "Research software management", +"Software management planning": "Research software management", +"Software version control (e.g., git 
and GitHub)": "Research software management", +"Software documentation": "Research software management", +"Software packaging (R, Python, etc.)": "Research software management", +"Software citation": "Research software management", +"FAIR software": "Research software management", +"Reproducibility": "Research software management", +"Coding conventions": "Research software management", +"Literate programming": "Research software management", +"Scientific workflows and data pipelines": "Research software management", +"Computer programming": "Research software management", +"Virtual environments and Containerisation": "Research software management", +"Continuous integration": "Research software management", +"Use of generative Artificial Intelligence in writing research software": "Research software management", + +"(Certified) Data Repositories": "Data infrastructure", +"Repository quality standards (e.g. CoreTrustSeal, ISAD(G), OAIS reference model)": "Data infrastructure", +"TRUST principles for digital repositories": "Data infrastructure", +"Tool criticism": "Data infrastructure", +"Data collection tools": "Data infrastructure", +"Data management services/tools": "Data infrastructure", +"Data storage (media)": "Data infrastructure", +"Data transfer tools": "Data infrastructure", +"Data analysis software/tools": "Data infrastructure", +"Cloud computing and High-performance computing": "Data infrastructure", +"Data security and Data classification": "Data infrastructure", +"Available RDM infrastructure and organisations": "Data infrastructure", +"European Open Science Cloud solutions": "Data infrastructure", + +"European, national and institutional policies on RDM, RSM and open science": "Policy and governance", +"Funder RDM, RSM and open science requirements": "Policy and governance", +"Journal policies related to RDM": "Policy and governance", +"Policy development": "Policy and governance", +"Policy implementation": "Policy and governance", +"Policy monitoring": 
"Policy and governance", +"Translating policy to organisational strategy": "Policy and governance", +"Responsible metrics (bibliometrics, altmetrics)": "Policy and governance", +"Digital sovereignty": "Policy and governance", +"Data governance": "Policy and governance", +"Data ownership": "Policy and governance", +"Key Performance Indicators (KPI) for RDM": "Policy and governance", + +"Privacy and data protection (GDPR, UAVG)": "Legal and ethical responsibilities", +"Sensitive data/Confidentiality": "Legal and ethical responsibilities", +"Intellectual property rights": "Legal and ethical responsibilities", +"copyright": "Legal and ethical responsibilities", +"patents": "Legal and ethical responsibilities", +"trademarks": "Legal and ethical responsibilities", +"Research in consortia": "Legal and ethical responsibilities", +"Data and software licenses ": "Legal and ethical responsibilities", +"License compatibility": "Legal and ethical responsibilities", +"Information security": "Legal and ethical responsibilities", +"Knowledge security": "Legal and ethical responsibilities", +"European data legislation": "Legal and ethical responsibilities", +"AI Act": "Legal and ethical responsibilities", +"Data Governance Act": "Legal and ethical responsibilities", +"Data Act": "Legal and ethical responsibilities", +"European Health Data Space": "Legal and ethical responsibilities", +"(Cyber)security legislation, e.g. 
NIS2 Directive": "Legal and ethical responsibilities", +"Trade Secret Protection Act": "Legal and ethical responsibilities", +"Research ethics and integrity": "Legal and ethical responsibilities", +"CARE principles": "Legal and ethical responsibilities", +"Diversity, equity & inclusion": "Legal and ethical responsibilities", + +"Needs assessment": "Training and awareness raising", +"Instructional design": "Training and awareness raising", +"FAIR-by-Design methodology": "Training and awareness raising", +"Carpentries methodology": "Training and awareness raising", +"Training andragogy": "Training and awareness raising", +"Didactic methods": "Training and awareness raising", +"Presentation skills": "Training and awareness raising", +"Written communication skills": "Training and awareness raising", +"Open Educational Resources": "Training and awareness raising", +"Diagnostic, formative and summative assessment": "Training and awareness raising", +"Course evaluation": "Training and awareness raising", +"Student satisfaction": "Training and awareness raising", + +"Networking skills": "Transversal skills", +"Community management": "Transversal skills", +"Existing RDM networks/communities": "Transversal skills", +"Consultancy": "Transversal skills", +"Advocacy": "Transversal skills", +"Conflict resolution": "Transversal skills", +"Negotiating": "Transversal skills", +"Active listening": "Transversal skills", +"Stakeholder analysis": "Transversal skills", +"Stakeholder engagement": "Transversal skills", +"Organisational development": "Transversal skills", +"Project management (methodologies)": "Transversal skills", +"Change management": "Transversal skills", +"Binding Leadership": "Transversal skills", +"Facilitation": "Transversal skills", +"Teamwork": "Transversal skills" +} \ No newline at end of file diff --git a/lib/ingestors/auto_parser_mappings/target_audience.json b/lib/ingestors/auto_parser_mappings/target_audience.json new file mode 100644 index 
000000000..849380c2f
--- /dev/null
+++ b/lib/ingestors/auto_parser_mappings/target_audience.json
@@ -0,0 +1,27 @@
+{
+  "post-docs": "researchers",
+  "PhD's candidate": "researchers",
+  "PhD student": "researchers",
+  "principal investigator": "researchers",
+  "professor": "researchers",
+  "scientist": "researchers",
+
+  "library staff": "research support staff",
+  "research librarian": "research support staff",
+  "information specialist": "research support staff",
+  "archivist": "research support staff",
+  "repository manager": "research support staff",
+  "data steward": "research support staff",
+  "data manager": "research support staff",
+  "data professional": "research support staff",
+  "data engineer": "research support staff",
+  "software engineer": "research support staff",
+  "data librarian": "research support staff",
+
+  "bachelor": "students",
+  "master": "students",
+
+  "teacher": "trainers",
+  "coaches": "trainers",
+  "educator": "trainers"
+}
diff --git a/lib/ingestors/event_ingestion.rb b/lib/ingestors/event_ingestion.rb
index 87c94b91f..c72a1470d 100644
--- a/lib/ingestors/event_ingestion.rb
+++ b/lib/ingestors/event_ingestion.rb
@@ -7,6 +7,10 @@ def add_event(event)
         c.send(:event_params)
         event = OpenStruct.new(c.send(:event_params))
       end
+      Array(TeSS::Config.feature['auto_parse_vars']).each do |var| # Array(): the setting may be absent
+        new_val = event && auto_parse(var, event.description.to_s) # event / description may be nil
+        event.public_send("#{var}=", (Array(event.public_send(var)) + new_val).uniq) if new_val # merge, keep existing
+      end
       @events << event unless event.nil?
end @@ -22,35 +26,17 @@ def convert_location(input) input end - def parse_audience(description) - audience_mapping = { - 'post-docs': 'researchers', - "PhD's candidate": 'researchers', - 'PhD student': 'researchers', - 'principal investigator': 'researchers', - 'professor': 'researchers', - 'scientist': 'researchers', - 'library staff': 'research support staff', - 'research librarian': 'research support staff', - 'information specialist': 'research support staff', - 'archivist': 'research support staff', - 'repository manager': 'research support staff', - 'data steward': 'research support staff', - 'data manager': 'research support staff', - 'data professional': 'research support staff', - 'data engineer': 'research support staff', - 'software engineer': 'research support staff', - 'data librarian': 'research support staff', - 'bachelor': 'students', - 'master': 'students', - 'teacher': 'trainers', - 'coaches': 'trainers', - 'educator': 'trainers', - } - audience_mapping - .select{ |key, val| description.downcase.include?(key.to_s.downcase) } - .values - .uniq + def auto_parse(var, description) + json_path = File.join(Rails.root, 'lib', 'ingestors', 'auto_parser_mappings', "#{var.to_s}.json") + res = nil + if File.exist?(json_path) + mapping = JSON.parse(File.read(json_path)) + res = mapping + .select{ |key, val| description.downcase.include?(key.to_s.downcase) } + .values + .uniq + end + res end def parse_dates(input, timezone = nil) diff --git a/lib/ingestors/material_ingestion.rb b/lib/ingestors/material_ingestion.rb index 9e8eaea91..d5a93d33d 100644 --- a/lib/ingestors/material_ingestion.rb +++ b/lib/ingestors/material_ingestion.rb @@ -7,7 +7,24 @@ def add_material(material) c.send(:material_params) material = OpenStruct.new(c.send(:material_params)) end + TeSS::Config.feature['auto_parse_vars'].each do |var| + new_val = auto_parse(var, event.description) + event.send("#{var}=", new_val) + end @materials << material unless material.nil? 
end + + def auto_parse(var, description) + json_path = File.join(Rails.root, 'lib', 'ingestors', 'auto_parser_mappings', "#{var.to_s}.json") + res = nil + if File.exist?(json_path) + mapping = JSON.parse(File.read(json_path)) + res = mapping + .select{ |key, val| description.downcase.include?(key.to_s.downcase) } + .values + .uniq + end + res + end end end diff --git a/lib/ingestors/taxila/dans_ingestor.rb b/lib/ingestors/taxila/dans_ingestor.rb index 409f3ca70..4422b3f57 100644 --- a/lib/ingestors/taxila/dans_ingestor.rb +++ b/lib/ingestors/taxila/dans_ingestor.rb @@ -64,7 +64,6 @@ def process_dans(url) event.source = 'DANS' event.timezone = 'Amsterdam' - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e diff --git a/lib/ingestors/taxila/dtls_ingestor.rb b/lib/ingestors/taxila/dtls_ingestor.rb index ae8fc8f47..5df97cb78 100644 --- a/lib/ingestors/taxila/dtls_ingestor.rb +++ b/lib/ingestors/taxila/dtls_ingestor.rb @@ -72,7 +72,6 @@ def process_dtls(url) event.set_default_times event.source = 'DTL' event.timezone = 'Amsterdam' - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e @messages << "Extract event fields failed with: #{e.message}" diff --git a/lib/ingestors/taxila/han_ingestor.rb b/lib/ingestors/taxila/han_ingestor.rb index 1b3e3379f..07efd587c 100644 --- a/lib/ingestors/taxila/han_ingestor.rb +++ b/lib/ingestors/taxila/han_ingestor.rb @@ -50,7 +50,6 @@ def process_han(_url) event.venue = "#{venue_super_css.text} #{venue_sub_css.text}" event.source = "HAN" event.timezone = 'Amsterdam' - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e diff --git a/lib/ingestors/taxila/lcrdm_ingestor.rb b/lib/ingestors/taxila/lcrdm_ingestor.rb index 3d36d5a62..b9dd727f9 100644 --- a/lib/ingestors/taxila/lcrdm_ingestor.rb +++ b/lib/ingestors/taxila/lcrdm_ingestor.rb @@ -58,7 +58,6 @@ def process_lcrdm(url) event.source = 'LCRDM' 
event.timezone = 'Amsterdam' event.set_default_times - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e diff --git a/lib/ingestors/taxila/leiden_ingestor.rb b/lib/ingestors/taxila/leiden_ingestor.rb index cffe95d30..e418e1a53 100644 --- a/lib/ingestors/taxila/leiden_ingestor.rb +++ b/lib/ingestors/taxila/leiden_ingestor.rb @@ -83,7 +83,6 @@ def process_leiden(url) # does TeSS support that? event.source = 'Universiteit Leiden' - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e diff --git a/lib/ingestors/taxila/maastricht_ingestor.rb b/lib/ingestors/taxila/maastricht_ingestor.rb index f4840d490..09d03ca26 100644 --- a/lib/ingestors/taxila/maastricht_ingestor.rb +++ b/lib/ingestors/taxila/maastricht_ingestor.rb @@ -58,7 +58,6 @@ def process_maastricht(url) event.timezone = 'Europe/Amsterdam' # how to get this from Icalendar Event object? event.source = 'Maastricht University' - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e diff --git a/lib/ingestors/taxila/nwo_ingestor.rb b/lib/ingestors/taxila/nwo_ingestor.rb index 9a2811730..61c3cec8c 100644 --- a/lib/ingestors/taxila/nwo_ingestor.rb +++ b/lib/ingestors/taxila/nwo_ingestor.rb @@ -47,7 +47,6 @@ def process_nwo(url) event.url = "https://www.nwo.nl#{event_data.css('h3.card__title > a').attribute('href').value}" event.source = 'NWO' - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e diff --git a/lib/ingestors/taxila/odissei_ingestor.rb b/lib/ingestors/taxila/odissei_ingestor.rb index ac47651ce..64fe7c1fd 100644 --- a/lib/ingestors/taxila/odissei_ingestor.rb +++ b/lib/ingestors/taxila/odissei_ingestor.rb @@ -59,7 +59,6 @@ def process_odissei(_url) event.source = 'ODISSEI' event.timezone = 'Amsterdam' event.set_default_times - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e 
@messages << "Extract event fields failed with: #{e.message}" diff --git a/lib/ingestors/taxila/oscd_ingestor.rb b/lib/ingestors/taxila/oscd_ingestor.rb index b7ab410c1..13fc1f534 100644 --- a/lib/ingestors/taxila/oscd_ingestor.rb +++ b/lib/ingestors/taxila/oscd_ingestor.rb @@ -57,7 +57,6 @@ def process_oscd(url) end if div&.next_sibling&.next_sibling.nil? || (div&.next_sibling&.next_sibling&.name == 'h1') event.set_default_times - event.target_audience = parse_audience(event.description) add_event(event) end end diff --git a/lib/ingestors/taxila/oscm_ingestor.rb b/lib/ingestors/taxila/oscm_ingestor.rb index f3d4127dd..ea68207e6 100644 --- a/lib/ingestors/taxila/oscm_ingestor.rb +++ b/lib/ingestors/taxila/oscm_ingestor.rb @@ -57,7 +57,6 @@ def process_oscm(url) # it's not really needed since dtstart and dtend contain timezone information event.source = 'OSCM' event.online = true - event.target_audience = parse_audience(event.description) # add event to events array add_event(event) diff --git a/lib/ingestors/taxila/rdnl_ingestor.rb b/lib/ingestors/taxila/rdnl_ingestor.rb index d75ca8477..6f4d8fa84 100644 --- a/lib/ingestors/taxila/rdnl_ingestor.rb +++ b/lib/ingestors/taxila/rdnl_ingestor.rb @@ -57,7 +57,6 @@ def process_rdnl(url) event.host_institutions = ['RDNL'] event.timezone = 'Amsterdam' event.set_default_times - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e diff --git a/lib/ingestors/taxila/rug_ingestor.rb b/lib/ingestors/taxila/rug_ingestor.rb index 7707d1cf4..1bdfbd051 100644 --- a/lib/ingestors/taxila/rug_ingestor.rb +++ b/lib/ingestors/taxila/rug_ingestor.rb @@ -50,7 +50,6 @@ def process_rug(url) event.source = 'RUG' event.timezone = 'Amsterdam' event.set_default_times - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e diff --git a/lib/ingestors/taxila/surf_ingestor.rb b/lib/ingestors/taxila/surf_ingestor.rb index 7bcf10879..8188d1f07 100644 --- 
a/lib/ingestors/taxila/surf_ingestor.rb +++ b/lib/ingestors/taxila/surf_ingestor.rb @@ -56,7 +56,6 @@ def process_surf(url) event.source = 'SURF' event.online = true event.timezone = 'Amsterdam' - event.target_audience = parse_audience(event.description) # add event to events array add_event(event) diff --git a/lib/ingestors/taxila/tdcc_ingestor.rb b/lib/ingestors/taxila/tdcc_ingestor.rb index 0dd6429f8..dd68d7ccd 100644 --- a/lib/ingestors/taxila/tdcc_ingestor.rb +++ b/lib/ingestors/taxila/tdcc_ingestor.rb @@ -59,7 +59,6 @@ def process_tdcc(url) event.start += 1.year event.end += 1.year end - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e diff --git a/lib/ingestors/taxila/utwente_ingestor.rb b/lib/ingestors/taxila/utwente_ingestor.rb index 2b77a0b61..76941ff82 100644 --- a/lib/ingestors/taxila/utwente_ingestor.rb +++ b/lib/ingestors/taxila/utwente_ingestor.rb @@ -52,7 +52,6 @@ def process_utwente(url) event.timezone = 'Amsterdam' event.organizer = 'University of Twente' event.source = 'University of Twente' - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e @messages << "Extract event fields failed with: #{e.message}" diff --git a/lib/ingestors/taxila/uu_ingestor.rb b/lib/ingestors/taxila/uu_ingestor.rb index f5e0a00fd..3fd16ab3d 100644 --- a/lib/ingestors/taxila/uu_ingestor.rb +++ b/lib/ingestors/taxila/uu_ingestor.rb @@ -106,7 +106,6 @@ def process_uu(url) event.timezone = 'Amsterdam' # UU wants opt-in instead of opt-out for this scraper event.visible = false - event.target_audience = parse_audience(event.description) # the below code allows fetching the long description, at the cost of a # page load per event. 
diff --git a/lib/ingestors/taxila/uva_ingestor.rb b/lib/ingestors/taxila/uva_ingestor.rb index 382fc8ea8..f0fd5b0ba 100644 --- a/lib/ingestors/taxila/uva_ingestor.rb +++ b/lib/ingestors/taxila/uva_ingestor.rb @@ -57,7 +57,6 @@ def process_uva(url) event.keywords = attr.fetch('taxonomy', []).map(&:values).flatten event.event_types = attr.fetch('eventType', []).map { |t| convert_event_types(t) } - event.target_audience = parse_audience(event.description) # add event to events array add_event(event) diff --git a/lib/ingestors/taxila/wur_ingestor.rb b/lib/ingestors/taxila/wur_ingestor.rb index d303ed0df..3f39c6ef5 100644 --- a/lib/ingestors/taxila/wur_ingestor.rb +++ b/lib/ingestors/taxila/wur_ingestor.rb @@ -82,7 +82,6 @@ def process_wur(url) event.set_default_times event.source = 'WUR' event.timezone = 'Amsterdam' - event.target_audience = parse_audience(event.description) add_event(event) rescue Exception => e