From 5e531ee14d0cb41747ed1be2fc4dfaa3b36ea11d Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 14 Jan 2026 15:03:35 +0100 Subject: [PATCH 01/19] feat: use IMO website instead of Wikipedia Also fix stock unit counts and make some tests more benevolent. --- .../06_locating_elements.md | 35 +++++++--------- .../exercises/imo_countries.mjs | 42 +++++++++++++++++++ .../exercises/test.bats | 15 +++---- .../exercises/wikipedia_countries.mjs | 29 ------------- .../06_locating_elements.md | 35 +++++++--------- .../exercises/imo_countries.py | 22 ++++++++++ .../exercises/test.bats | 15 +++---- .../exercises/wikipedia_countries.py | 17 -------- 8 files changed, 112 insertions(+), 98 deletions(-) create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs delete mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py delete mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py diff --git a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md index d6666fcbd1..b339729ca6 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/locating-elements import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCountriesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries.mjs'; +import IMOCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs'; import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.mjs'; import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.mjs'; @@ -212,39 +212,36 @@ Great! We have managed to use CSS selectors and walk the HTML tree to get a list -### Scrape Wikipedia +### Scrape list of International Maritime Organization members -Download Wikipedia's page with the list of African countries, use Cheerio to parse it, and print short English names of all the states and territories mentioned in all tables. This is the URL: +Download International Maritime Organization's page with the list of members, use Cheerio to parse it, and print names of all the members mentioned in all tables (including Associate Members). This is the URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx ``` Your program should print the following: ```text +Albania Algeria -Angola -Benin -Botswana -Burkina Faso -Burundi -Cameroon -Cape Verde -Central African Republic -Chad -Comoros -Democratic Republic of the Congo -Republic of the Congo -Djibouti ... +Liberia +Libya +... +Zimbabwe +Faroes +Hong Kong, China +Macao, China ```
  Solution
-    {WikipediaCountriesExercise.code}
+    {IMOCountriesExercise.code}
+
+    We visit each row and if we find some [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first one. We print it if it's not empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows.
 
-    Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells.
+    Then we visit each row again and check whether it contains more than two cells. If so, we take the text of the third one and, again, print it if it's not empty. This way we correctly process the large table with its left and right parts.
 
</details>
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs new file mode 100644 index 0000000000..85ee3d071f --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs @@ -0,0 +1,42 @@ +import * as cheerio from 'cheerio'; + +const url = 'https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const tableElement of $('.content table').toArray()) { + const $table = $(tableElement); + const rows = $table.find('tr'); + + for (const rowElement of rows.toArray()) { + const $row = $(rowElement); + const cells = $row.find('td'); + + if (cells.length > 0) { + const $firstColumn = $(cells[0]); + const text = $firstColumn.text().trim(); + if (text) { + console.log(text); + } + } + } + + for (const rowElement of rows.toArray()) { + const $row = $(rowElement); + const cells = $row.find('td'); + + if (cells.length > 2) { + const $thirdColumn = $(cells[2]); + const text = $thirdColumn.text().trim(); + if (text) { + console.log(text); + } + } + } +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index 618b64cd14..608e0f1b82 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -31,10 +31,11 @@ teardown_file() { [[ "$output" == "18" ]] } -@test "lists African countries" { - run node wikipedia_countries.mjs +@test "lists IMO countries" { + run node imo_countries.mjs - [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ "$output" == *$'\nLiberia\nLibya\n'* ]] + [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -56,7 +57,7 @@ teardown_file() { run node warehouse_units.mjs [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] - [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 76\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -64,7 +65,7 @@ teardown_file() { run node warehouse_units_regex.mjs [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] - [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 76\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -72,7 +73,7 @@ teardown_file() { run node guardian_publish_dates.mjs [[ "$output" == *' F1 '* ]] - [[ "$output" == *' | Sun '* ]] # has info about date, Sundays are very likely + [[ "$output" == *' | Mon '* ]] # has info about date, Mondays are very likely [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -134,7 +135,7 @@ teardown_file() { (( status == 0 )) [[ -f dataset.json ]] - [[ $(cat dataset.json | jq '. | length') == "18" ]] + [[ $(cat dataset.json | jq '. 
| length') -gt 6 ]] [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs deleted file mode 100644 index fd9a7f2fb9..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs +++ /dev/null @@ -1,29 +0,0 @@ -import * as cheerio from 'cheerio'; - -const url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa'; -const response = await fetch(url); - -if (!response.ok) { - throw new Error(`HTTP ${response.status}`); -} - -const html = await response.text(); -const $ = cheerio.load(html); - -for (const tableElement of $('.wikitable').toArray()) { - const $table = $(tableElement); - const rows = $table.find('tr'); - - for (const rowElement of rows.toArray()) { - const $row = $(rowElement); - const cells = $row.find('td'); - - if (cells.length > 0) { - const $thirdColumn = $(cells[2]); - const $link = $thirdColumn.find('a').first(); - if ($link.length) { - console.log($link.text()); - } - } - } -} diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md index 5dce9bc0e6..c37a4c56ca 100644 --- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md @@ -7,7 +7,7 @@ slug: /scraping-basics-python/locating-elements import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCountriesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries.py'; +import IMOCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs'; import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.py'; import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.py'; @@ -217,39 +217,36 @@ Great! We have managed to use CSS selectors and walk the HTML tree to get a list -### Scrape Wikipedia +### Scrape list of International Maritime Organization members -Download Wikipedia's page with the list of African countries, use Beautiful Soup to parse it, and print short English names of all the states and territories mentioned in all tables. This is the URL: +Download International Maritime Organization's page with the list of members, use Cheerio to parse it, and print names of all the members mentioned in all tables (including Associate Members). This is the URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx ``` Your program should print the following: ```text +Albania Algeria -Angola -Benin -Botswana -Burkina Faso -Burundi -Cameroon -Cape Verde -Central African Republic -Chad -Comoros -Democratic Republic of the Congo -Republic of the Congo -Djibouti ... +Liberia +Libya +... +Zimbabwe +Faroes +Hong Kong, China +Macao, China ```
  Solution
-    {WikipediaCountriesExercise.code}
+    {IMOCountriesExercise.code}
+
+    We visit each row and if we find some [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first one. We print it if it's not empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows.
 
-    Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells.
+    Then we visit each row again and check whether it contains more than two cells. If so, we take the text of the third one and, again, print it if it's not empty. This way we correctly process the large table with its left and right parts.
 
</details>
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py b/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py new file mode 100644 index 0000000000..739e288f6d --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py @@ -0,0 +1,22 @@ +import httpx +from bs4 import BeautifulSoup + +url = "https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx" + +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for table in soup.select(".content table"): + for row in table.select("tr"): + if cells := row.select("td"): + first_column = cells[0] + if text := first_column.text.strip(): + print(text) + for row in table.select("tr"): + if cells := row.select("td"): + if len(cells) > 2: + third_column = cells[2] + if text := third_column.text.strip(): + print(text) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 2de3db35f3..07a314548d 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -24,10 +24,11 @@ teardown() { [[ "$output" == "18" ]] } -@test "lists African countries" { - run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries.py +@test "lists IMO countries" { + run uv run --with=httpx --with=beautifulsoup4 python imo_countries.py - [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ "$output" == *$'\nLiberia\nLibya\n'* ]] + [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -49,7 +50,7 @@ teardown() { run uv run --with=httpx --with=beautifulsoup4 python warehouse_units.py [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] - [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 76\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -57,7 +58,7 @@ teardown() { run uv run --with=httpx --with=beautifulsoup4 python warehouse_units_regex.py [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] - [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 76\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -65,7 +66,7 @@ teardown() { run uv run --with=httpx --with=beautifulsoup4 python guardian_publish_dates.py [[ "$output" == *' F1 '* ]] - [[ "$output" == *' | Sun '* ]] # has info about date, Sundays are very likely + [[ "$output" == *' | Mon '* ]] # has info about date, Mondays are very likely [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -129,7 +130,7 @@ teardown() { (( status == 0 )) [[ -f dataset.json ]] - [[ $(cat dataset.json | jq '. | length') == "18" ]] + [[ $(cat dataset.json | jq '. 
| length') -gt 6 ]] [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py deleted file mode 100644 index 0d4769ccbb..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py +++ /dev/null @@ -1,17 +0,0 @@ -import httpx -from bs4 import BeautifulSoup - -url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" -response = httpx.get(url) -response.raise_for_status() - -soup = BeautifulSoup(response.text, "html.parser") - -for table in soup.select(".wikitable"): - for row in table.select("tr"): - cells = row.select("td") - if cells: - third_column = cells[2] - link = third_column.select_one("a") - if link: - print(link.text) From 804fa846ad3ca2fa9e58c69813f470bf4c86861a Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 14 Jan 2026 15:23:58 +0100 Subject: [PATCH 02/19] feat: improve exercise guidance, shorten the JS code example --- .../06_locating_elements.md | 6 ++++ .../exercises/imo_countries.mjs | 36 ++++++++----------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md index b339729ca6..a6052d5c21 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md @@ -235,6 +235,12 @@ Hong Kong, China Macao, China ``` +:::tip Need a nudge? + +You may want to check out Cheerio's [`.eq()`](https://cheerio.js.org/docs/api/classes/Cheerio#eq). + +::: +
Solution {IMOCountriesExercise.code} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs index 85ee3d071f..777192e3f6 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs @@ -12,31 +12,23 @@ const $ = cheerio.load(html); for (const tableElement of $('.content table').toArray()) { const $table = $(tableElement); - const rows = $table.find('tr'); - - for (const rowElement of rows.toArray()) { - const $row = $(rowElement); - const cells = $row.find('td'); - - if (cells.length > 0) { - const $firstColumn = $(cells[0]); - const text = $firstColumn.text().trim(); - if (text) { - console.log(text); - } + const rows = $table.find('tr').toArray(); + + for (const rowElement of rows) { + const $cells = $(rowElement).find('td'); + const $firstCell = $cells.eq(0); + const firstCellText = $firstCell.text().trim(); + if (firstCellText) { + console.log(firstCellText); } } - for (const rowElement of rows.toArray()) { - const $row = $(rowElement); - const cells = $row.find('td'); - - if (cells.length > 2) { - const $thirdColumn = $(cells[2]); - const text = $thirdColumn.text().trim(); - if (text) { - console.log(text); - } + for (const rowElement of rows) { + const $cells = $(rowElement).find('td'); + const $thirdCell = $cells.eq(2); + const thirdCellText = $thirdCell.text().trim(); + if (thirdCellText) { + console.log(thirdCellText); } } } From f5aee7ba307997e241feac9ba754f283f4242de3 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 14 Jan 2026 15:52:35 +0100 Subject: [PATCH 03/19] fix: price should be in cents --- .../scraping_basics_python/exercises/process_products_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py b/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py index 934d347b8a..9854aa7b71 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py @@ -5,5 +5,5 @@ products = json.load(file) for product in products: - if int(product['min_price']) > 500: + if int(product["min_price"]) > 50000: pp(product) From 08b9e3ff164719166f3f01e87c754b37f9eb8e71 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 14 Jan 2026 15:53:37 +0100 Subject: [PATCH 04/19] feat: migrate follow-up exercise from Wikipedia to IMO --- .../06_locating_elements.md | 10 +++++----- .../exercises/imo_countries.mjs | 4 +--- .../imo_countries_single_selector.mjs | 18 ++++++++++++++++++ .../exercises/test.bats | 9 +++++---- .../06_locating_elements.md | 12 ++++++------ .../exercises/imo_countries.py | 2 -- .../exercises/imo_countries_single_selector.py | 12 ++++++++++++ .../scraping_basics_python/exercises/test.bats | 9 +++++---- 8 files changed, 52 insertions(+), 24 deletions(-) create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries_single_selector.mjs create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/imo_countries_single_selector.py diff --git a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md index 
a6052d5c21..201c7cfb59 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md @@ -9,7 +9,7 @@ import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; import IMOCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs'; -import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.mjs'; +import IMOCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/imo_countries_single_selector.mjs'; import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.mjs'; @@ -224,11 +224,11 @@ Your program should print the following: ```text Albania +Libya Algeria +Lithuania ... Liberia -Libya -... Zimbabwe Faroes Hong Kong, China @@ -253,7 +253,7 @@ You may want to check out Cheerio's [`.eq()`](https://cheerio.js.org/docs/api/cl ### Use CSS selectors to their max -Simplify the code from previous exercise. Use a single for loop and a single CSS selector. +Simplify your International Maritime Organization scraper from the previous exercise. Use one `for` loop and a single CSS selector that covers every member cell across the tables. :::tip Need a nudge? @@ -266,7 +266,7 @@ You may want to check out the following pages:
Solution - {WikipediaCountriesSingleSelectorExercise.code} + {IMOCountriesSingleSelectorExercise.code}
### Scrape F1 news diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs index 777192e3f6..22c611ab17 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs @@ -16,15 +16,13 @@ for (const tableElement of $('.content table').toArray()) { for (const rowElement of rows) { const $cells = $(rowElement).find('td'); + const $firstCell = $cells.eq(0); const firstCellText = $firstCell.text().trim(); if (firstCellText) { console.log(firstCellText); } - } - for (const rowElement of rows) { - const $cells = $(rowElement).find('td'); const $thirdCell = $cells.eq(2); const thirdCellText = $thirdCell.text().trim(); if (thirdCellText) { diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries_single_selector.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries_single_selector.mjs new file mode 100644 index 0000000000..f682d5b40f --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries_single_selector.mjs @@ -0,0 +1,18 @@ +import * as cheerio from 'cheerio'; + +const url = 'https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('.content table tr td:nth-child(odd)').toArray()) { + const name = $(element).text().trim(); + if (name) { + console.log(name); + } +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index 608e0f1b82..ceb2e7de78 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -34,15 +34,16 @@ teardown_file() { @test "lists IMO countries" { run node imo_countries.mjs - [[ "$output" == *$'\nLiberia\nLibya\n'* ]] + [[ "$output" == *$'Albania\nLibya\n'* ]] [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "lists African countries with a single selector" { - run node wikipedia_countries_single_selector.mjs +@test "lists IMO countries with a single selector" { + run node imo_countries_single_selector.mjs - [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ "$output" == *$'Albania\nLibya\n'* ]] + [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md index c37a4c56ca..b53ab88a91 100644 --- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md @@ -8,7 +8,7 @@ slug: /scraping-basics-python/locating-elements import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; import IMOCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs'; -import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.py'; +import 
IMOCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/imo_countries_single_selector.py'; import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.py'; **In this lesson we'll locate product data in the downloaded HTML. We'll use BeautifulSoup to find those HTML elements which contain details about each product, such as title or price.** @@ -219,7 +219,7 @@ Great! We have managed to use CSS selectors and walk the HTML tree to get a list ### Scrape list of International Maritime Organization members -Download International Maritime Organization's page with the list of members, use Cheerio to parse it, and print names of all the members mentioned in all tables (including Associate Members). This is the URL: +Download International Maritime Organization's page with the list of members, use Beautiful Soup to parse it, and print names of all the members mentioned in all tables (including Associate Members). This is the URL: ```text https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx @@ -229,11 +229,11 @@ Your program should print the following: ```text Albania +Libya Algeria +Lithuania ... Liberia -Libya -... Zimbabwe Faroes Hong Kong, China @@ -252,7 +252,7 @@ Macao, China ### Use CSS selectors to their max -Simplify the code from previous exercise. Use a single for loop and a single CSS selector. +Simplify your International Maritime Organization scraper from the previous exercise. Use one `for` loop and a single CSS selector that covers every member cell across the tables. :::tip Need a nudge? @@ -265,7 +265,7 @@ You may want to check out the following pages:
Solution - {WikipediaCountriesSingleSelectorExercise.code} + {IMOCountriesSingleSelectorExercise.code}
### Scrape F1 news diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py b/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py index 739e288f6d..438ea51b3b 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py @@ -14,8 +14,6 @@ first_column = cells[0] if text := first_column.text.strip(): print(text) - for row in table.select("tr"): - if cells := row.select("td"): if len(cells) > 2: third_column = cells[2] if text := third_column.text.strip(): diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries_single_selector.py b/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries_single_selector.py new file mode 100644 index 0000000000..168e172c2d --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries_single_selector.py @@ -0,0 +1,12 @@ +import httpx +from bs4 import BeautifulSoup + +url = "https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx" +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for cell in soup.select(".content table tr td:nth-child(odd)"): + if name := cell.text.strip(): + print(name) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 07a314548d..77fa3ba391 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -27,15 +27,16 @@ teardown() { @test "lists IMO countries" { run uv run --with=httpx --with=beautifulsoup4 python imo_countries.py - [[ "$output" == *$'\nLiberia\nLibya\n'* ]] + [[ "$output" == *$'Albania\nLibya\n'* ]] [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "lists African countries with a single selector" { - run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries_single_selector.py +@test "lists IMO countries with a single selector" { + run uv run --with=httpx --with=beautifulsoup4 python imo_countries_single_selector.py - [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ "$output" == *$'Albania\nLibya\n'* ]] + [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } From 54699664dc0218c725bdca84a189c2b09ae73f8f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 14 Jan 2026 16:02:48 +0100 Subject: [PATCH 05/19] chore: remove redundant exercise code --- .../wikipedia_countries_single_selector.mjs | 19 ------------------- .../wikipedia_countries_single_selector.py | 13 ------------- 2 files changed, 32 deletions(-) delete mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs delete mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs deleted file mode 100644 index 06f54d0686..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs +++ /dev/null @@ -1,19 +0,0 @@ -import * as cheerio from 
'cheerio'; - -const url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa'; -const response = await fetch(url); - -if (!response.ok) { - throw new Error(`HTTP ${response.status}`); -} - -const html = await response.text(); -const $ = cheerio.load(html); - -for (const element of $('.wikitable tr td:nth-child(3)').toArray()) { - const $nameCell = $(element); - const $link = $nameCell.find('a').first(); - if ($link.length) { - console.log($link.text()); - } -} diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py deleted file mode 100644 index 1fc4a6b268..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py +++ /dev/null @@ -1,13 +0,0 @@ -import httpx -from bs4 import BeautifulSoup - -url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" -response = httpx.get(url) -response.raise_for_status() - -soup = BeautifulSoup(response.text, "html.parser") - -for name_cell in soup.select(".wikitable tr td:nth-child(3)"): - link = name_cell.select_one("a") - if link: - print(link.text) From b1d441cedb6f82bc28f1d82df77a5d543c25cc44 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 14 Jan 2026 16:08:37 +0100 Subject: [PATCH 06/19] feat: move links exercise from Wikipedia to UNESCO --- .../09_getting_links.md | 18 ++++++++--------- .../exercises/test.bats | 9 ++++----- .../exercises/unesco_links.mjs | 17 ++++++++++++++++ .../exercises/wikipedia_country_links.mjs | 20 ------------------- 4 files changed, 30 insertions(+), 34 deletions(-) create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs delete mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs diff --git a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md index 8670a0536e..17870239a1 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/getting-links import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCountryLinksExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_country_links.mjs'; +import UNESCOLinksExercise from '!!raw-loader!roa-loader!./exercises/unesco_links.mjs'; import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.mjs'; @@ -324,27 +324,27 @@ Ta-da! We've managed to get links leading to the product pages. In the next less -### Scrape links to countries in Africa +### Scrape links to UNESCO members -Download Wikipedia's page with the list of African countries, use Cheerio to parse it, and print links to Wikipedia pages of all the states and territories mentioned in all tables. Start with this URL: +Download UNESCO's page with the list of its members, use Cheerio to parse it, and print links to detail pages of all the members. 
Start with this URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.unesco.org/en/countries ``` Your program should print the following: ```text -https://en.wikipedia.org/wiki/Algeria -https://en.wikipedia.org/wiki/Angola -https://en.wikipedia.org/wiki/Benin -https://en.wikipedia.org/wiki/Botswana +https://www.unesco.org/en/countries/af +https://www.unesco.org/en/countries/al +https://www.unesco.org/en/countries/dz +https://www.unesco.org/en/countries/ad ... ```
Solution - {WikipediaCountryLinksExercise.code} + {UNESCOLinksExercise.code}
### Scrape links to F1 news diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index ceb2e7de78..67ea5d0543 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -86,12 +86,11 @@ teardown_file() { [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]] } -@test "lists Wikipedia country links" { - run node wikipedia_country_links.mjs +@test "lists UNESCO member links" { + run node unesco_links.mjs - [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]] - [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *$'https://www.unesco.org/en/countries/af\nhttps://www.unesco.org/en/countries/al\n'* ]] + [[ $(echo "$output" | wc -l) -gt 50 ]] } @test "lists Guardian F1 article links" { diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs new file mode 100644 index 0000000000..0cbf62e5f4 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs @@ -0,0 +1,17 @@ +import * as cheerio from 'cheerio'; + +const listingUrl = 'https://www.unesco.org/en/countries'; +const response = await fetch(listingUrl); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('.node--type-country').toArray()) {; + const $link = $(element).find('a').first(); + const url = new URL($link.attr('href'), listingUrl).href; + console.log(url); +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs deleted file mode 100644 index 53c95f8d4d..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs +++ /dev/null @@ -1,20 +0,0 @@ -import * as cheerio from 'cheerio'; - -const listingUrl = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa'; -const response = await fetch(listingUrl); - -if (!response.ok) { - throw new Error(`HTTP ${response.status}`); -} - -const html = await response.text(); -const $ = cheerio.load(html); - -for (const element of $('.wikitable tr td:nth-child(3)').toArray()) { - const $nameCell = $(element); - const $link = $nameCell.find('a').first(); - if ($link.length) { - const url = new URL($link.attr('href'), listingUrl).href; - console.log(url); - } -} From 3296ec9c736775fd2fc761316a5856b734f07c03 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 14 Jan 2026 16:55:45 +0100 Subject: [PATCH 07/19] feat: move links exercise from Wikipedia to UNESCO, Python course --- .../09_getting_links.md | 20 +++++++++---------- .../exercises/test.bats | 9 ++++----- .../exercises/unesco_links.py | 15 ++++++++++++++ .../exercises/wikipedia_country_links.py | 15 -------------- 4 files changed, 29 insertions(+), 30 deletions(-) create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/unesco_links.py delete mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py diff 
--git a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md index ea5a79a915..94cee8fa0c 100644 --- a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md @@ -7,7 +7,7 @@ slug: /scraping-basics-python/getting-links import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCountryLinksExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_country_links.py'; +import UNESCOLinksExercise from '!!raw-loader!roa-loader!./exercises/unesco_links.py'; import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.py'; **In this lesson, we'll locate and extract links to individual product pages. We'll use BeautifulSoup to find the relevant bits of HTML.** @@ -327,27 +327,27 @@ Ta-da! We've managed to get links leading to the product pages. In the next less -### Scrape links to countries in Africa +### Scrape links to UNESCO members -Download Wikipedia's page with the list of African countries, use Beautiful Soup to parse it, and print links to Wikipedia pages of all the states and territories mentioned in all tables. Start with this URL: +Download UNESCO's page with the list of its members, use Beautiful Soup to parse it, and print links to detail pages of all the members. Start with this URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.unesco.org/en/countries ``` Your program should print the following: ```text -https://en.wikipedia.org/wiki/Algeria -https://en.wikipedia.org/wiki/Angola -https://en.wikipedia.org/wiki/Benin -https://en.wikipedia.org/wiki/Botswana +https://www.unesco.org/en/countries/af +https://www.unesco.org/en/countries/al +https://www.unesco.org/en/countries/dz +https://www.unesco.org/en/countries/ad ... ```
- Solution - {WikipediaCountryLinksExercise.code} + Solution + {UNESCOLinksExercise.code}
### Scrape links to F1 news diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 77fa3ba391..7716000a88 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -79,12 +79,11 @@ teardown() { [[ "$output" == "{'title': 'Premium Speakers', 'min_price': 75000, 'price': 75000}" ]] } -@test "lists Wikipedia country links" { - run uv run --with=httpx --with=beautifulsoup4 python wikipedia_country_links.py +@test "lists UNESCO member links" { + run uv run --with=httpx --with=beautifulsoup4 python unesco_links.py - [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]] - [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *$'https://www.unesco.org/en/countries/af\nhttps://www.unesco.org/en/countries/al\n'* ]] + [[ $(echo "$output" | wc -l) -gt 50 ]] } @test "lists Guardian F1 article links" { diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/unesco_links.py b/sources/academy/webscraping/scraping_basics_python/exercises/unesco_links.py new file mode 100644 index 0000000000..a2c80b7f5d --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/unesco_links.py @@ -0,0 +1,15 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + +listing_url = "https://www.unesco.org/en/countries" +response = httpx.get(listing_url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for country in soup.select(".node--type-country"): + link = country.select_one("a") + if link and 'href' in link.attrs: + url = urljoin(listing_url, link["href"]) + print(url) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py deleted file mode 100644 index f435016e45..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py +++ /dev/null @@ -1,15 +0,0 @@ -import httpx -from bs4 import BeautifulSoup -from urllib.parse import urljoin - -listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" -response = httpx.get(listing_url) -response.raise_for_status() - -soup = BeautifulSoup(response.text, "html.parser") - -for name_cell in soup.select('.wikitable tr td:nth-child(3)'): - link = name_cell.select_one('a') - if link and 'href' in link.attrs: - url = urljoin(listing_url, link['href']) - print(url) From 3bafd42cb4d56dc02829e72cf2ce41d5d0a17d5f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 14 Jan 2026 17:11:16 +0100 Subject: [PATCH 08/19] feat: move away from Wikipedia to UNESCO for a crawling exercise --- .../scraping_basics_python/10_crawling.md | 21 ++++++------ .../exercises/test.bats | 10 +++--- .../exercises/unesco_whs_counts.py | 31 ++++++++++++++++++ .../exercises/wikipedia_calling_codes.py | 32 ------------------- 4 files changed, 45 insertions(+), 49 deletions(-) create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py delete mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md 
b/sources/academy/webscraping/scraping_basics_python/10_crawling.md index 893683792b..54f185e00d 100644 --- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md @@ -7,7 +7,7 @@ slug: /scraping-basics-python/crawling import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.py'; +import UnescoWhsCountsExercise from '!!raw-loader!roa-loader!./exercises/unesco_whs_counts.py'; import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.py'; **In this lesson, we'll follow links to individual product pages. We'll use HTTPX to download them and BeautifulSoup to process them.** @@ -183,24 +183,21 @@ In the next lesson, we'll scrape the product detail pages so that each product v -### Scrape calling codes of African countries +### Scrape UNESCO World Heritage Sites -Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL: +Scrape links to detail pages of all UNESCO members. Follow each link and extract the count of the World Heritage Sites. Print the URL and the number for each country. Start with this URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.unesco.org/en/countries ``` Your program should print the following: ```text -https://en.wikipedia.org/wiki/Algeria +213 -https://en.wikipedia.org/wiki/Angola +244 -https://en.wikipedia.org/wiki/Benin +229 -https://en.wikipedia.org/wiki/Botswana +267 -https://en.wikipedia.org/wiki/Burkina_Faso +226 -https://en.wikipedia.org/wiki/Burundi None -https://en.wikipedia.org/wiki/Cameroon +237 +https://www.unesco.org/en/countries/af 2 +https://www.unesco.org/en/countries/al 4 +https://www.unesco.org/en/countries/dz 7 +https://www.unesco.org/en/countries/ad 1 ... ``` @@ -212,7 +209,7 @@ Locating cells in tables is sometimes easier if you know how to [navigate up](ht
Solution - {WikipediaCallingCodesExercise.code} + {UnescoWhsCountsExercise.code}
### Scrape authors of F1 news articles diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 7716000a88..d50bf41710 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -93,12 +93,12 @@ teardown() { [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "prints Wikipedia calling codes" { - run uv run --with=httpx --with=beautifulsoup4 python wikipedia_calling_codes.py +@test "prints counts of UNESCO WHS" { + run uv run --with=httpx --with=beautifulsoup4 python unesco_whs_counts.py - [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]] - [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *$'https://www.unesco.org/en/countries/af 2\n'* ]] + [[ "$output" == *$'https://www.unesco.org/en/countries/bs 0\n'* ]] + [[ $(echo "$output" | wc -l) -gt 50 ]] } @test "lists Guardian F1 authors" { diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py b/sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py new file mode 100644 index 0000000000..53230a8c4a --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py @@ -0,0 +1,31 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +def download(url: str) -> BeautifulSoup: + response = httpx.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, "html.parser") + + +def parse_whc_count(soup: BeautifulSoup) -> int: + for card in soup.select(".card-body"): + card_title = card.select_one(".card-title").text + if "World Heritage Sites" in card_title: + return int(card.select_one(".card-number").text.strip()) + return 0 + + +listing_url = "https://www.unesco.org/en/countries" +listing_soup = download(listing_url) + +for country in listing_soup.select(".node--type-country"): + link = country.select_one("a") + if not link or 'href' not in link.attrs: + continue + + country_url = urljoin(listing_url, link["href"]) + country_soup = download(country_url) + whs_count = parse_whc_count(country_soup) + print(country_url, whs_count) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py deleted file mode 100644 index 4d424d6dc1..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py +++ /dev/null @@ -1,32 +0,0 @@ -import httpx -from bs4 import BeautifulSoup -from urllib.parse import urljoin - - -def download(url: str) -> BeautifulSoup: - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - -def parse_calling_code(soup: BeautifulSoup) -> str | None: - for label in soup.select('th.infobox-label'): - if label.text.strip() == 'Calling code': - cell = label.parent.select_one('td.infobox-data') - return cell.text.strip() if cell else None - return None - - -listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" -listing_soup = download(listing_url) - -for name_cell in listing_soup.select('.wikitable tr td:nth-child(3)'): - link = name_cell.select_one('a') - if not link or 'href' not in link.attrs: - 
continue - - country_url = urljoin(listing_url, link['href']) - country_soup = download(country_url) - calling_code = parse_calling_code(country_soup) - - print(country_url, calling_code) From 0e714962bf70b69efb8e1a9b19e8db62aabb2084 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 14 Jan 2026 17:28:29 +0100 Subject: [PATCH 09/19] feat: move away from Wikipedia to UNESCO for a crawling exercise, JavaScript --- .../scraping_basics_javascript/10_crawling.md | 21 ++++------ .../exercises/test.bats | 10 ++--- .../exercises/unesco_whs_counts.mjs | 41 +++++++++++++++++++ .../exercises/test.bats | 4 +- 4 files changed, 57 insertions(+), 19 deletions(-) create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs diff --git a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md index 7fb737c293..6480744404 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/crawling import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.mjs'; +import UnescoWhsCountsExercise from '!!raw-loader!roa-loader!./exercises/unesco_whs_counts.mjs'; import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.mjs'; @@ -210,24 +210,21 @@ In the next lesson, we'll scrape the product detail pages so that each product v -### Scrape calling codes of African countries +### Scrape UNESCO World Heritage Sites -Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL: +Scrape links to detail pages of all UNESCO members. Follow each link and extract the count of the World Heritage Sites. Print the URL and the number for each country. Start with this URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.unesco.org/en/countries ``` Your program should print the following: ```text -https://en.wikipedia.org/wiki/Algeria +213 -https://en.wikipedia.org/wiki/Angola +244 -https://en.wikipedia.org/wiki/Benin +229 -https://en.wikipedia.org/wiki/Botswana +267 -https://en.wikipedia.org/wiki/Burkina_Faso +226 -https://en.wikipedia.org/wiki/Burundi null -https://en.wikipedia.org/wiki/Cameroon +237 +https://www.unesco.org/en/countries/af 2 +https://www.unesco.org/en/countries/al 4 +https://www.unesco.org/en/countries/dz 7 +https://www.unesco.org/en/countries/ad 1 ... ``` @@ -239,7 +236,7 @@ Locating cells in tables is sometimes easier if you know how to [filter](https:/
Solution - {WikipediaCallingCodesExercise.code} + {UnescoWhsCountsExercise.code}
### Scrape authors of F1 news articles diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index 67ea5d0543..5841d6fb11 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -90,7 +90,7 @@ teardown_file() { run node unesco_links.mjs [[ "$output" == *$'https://www.unesco.org/en/countries/af\nhttps://www.unesco.org/en/countries/al\n'* ]] - [[ $(echo "$output" | wc -l) -gt 50 ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 article links" { @@ -100,11 +100,11 @@ teardown_file() { [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "prints Wikipedia calling codes" { - run node wikipedia_calling_codes.mjs +@test "prints counts of UNESCO WHS" { + run node unesco_whs_counts.mjs - [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]] - [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]] + [[ "$output" == *$'https://www.unesco.org/en/countries/af 2\n'* ]] + [[ "$output" == *$'https://www.unesco.org/en/countries/bs 0\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs new file mode 100644 index 0000000000..5152824d80 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs @@ -0,0 +1,41 @@ +import * as cheerio from 'cheerio'; + +async function download(url) { + const response = await fetch(url); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const html = await response.text(); + return cheerio.load(html); +} + +function parseWhsCount($) { + for (const element of $('.card-body').toArray()) { + const $card = $(element); + const title = $card.find('.card-title').text(); + + if (title.includes('World Heritage Sites')) { + const number = $card.find('.card-number').text().trim(); + return Number.parseInt(number, 10); + } + } + return 0; +} + +const listingUrl = 'https://www.unesco.org/en/countries'; +const $listing = await download(listingUrl); + +for (const element of $listing('.node--type-country').toArray()) { + const $countryCard = $listing(element); + const $link = $countryCard.find('a').first(); + const href = $link.attr('href'); + + if (!href) { + continue; + } + + const countryUrl = new URL(href, listingUrl).href; + const $country = await download(countryUrl); + const whsCount = parseWhsCount($country); + console.log(`${countryUrl} ${whsCount}`); +} diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index d50bf41710..a1da282e21 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -83,7 +83,7 @@ teardown() { run uv run --with=httpx --with=beautifulsoup4 python unesco_links.py [[ "$output" == *$'https://www.unesco.org/en/countries/af\nhttps://www.unesco.org/en/countries/al\n'* ]] - [[ $(echo "$output" | wc -l) -gt 50 ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 article links" { @@ -98,7 +98,7 @@ teardown() { [[ "$output" == *$'https://www.unesco.org/en/countries/af 2\n'* ]] [[ "$output" == 
*$'https://www.unesco.org/en/countries/bs 0\n'* ]]
-  [[ $(echo "$output" | wc -l) -gt 50 ]]
+  [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists Guardian F1 authors" {

From 79cceba74225021de10dd23d67dda17864cb0194 Mon Sep 17 00:00:00 2001
From: Honza Javorek
Date: Thu, 15 Jan 2026 09:46:05 +0100
Subject: [PATCH 10/19] fix: edit exercise text to describe the new, simplified approach

---
 .../scraping_basics_javascript/06_locating_elements.md | 6 ++----
 .../scraping_basics_python/06_locating_elements.md     | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md
index 201c7cfb59..96f3cbb21c 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md
@@ -245,15 +245,13 @@
   Solution
     {IMOCountriesExercise.code}
 
-    We visit each row and if we find some [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first one. We print it if it's not empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows.
-
-    Then we visit each row again and check whether it contains more than two cells. If so, we take the text of the third one and, again, print it if it's not empty. This way we correctly process the large table with its left and right parts.
+    We visit each row and if we find some [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first and third ones and print them if they're not empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows.
 
</details>
 
### Use CSS selectors to their max -Simplify your International Maritime Organization scraper from the previous exercise. Use one `for` loop and a single CSS selector that covers every member cell across the tables. +Simplify your International Maritime Organization scraper from the previous exercise. Use just one `for` loop with a single CSS selector that targets all relevant table cells. :::tip Need a nudge? diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md index b53ab88a91..4599cf5a98 100644 --- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md @@ -244,15 +244,13 @@ Macao, China Solution {IMOCountriesExercise.code} - We visit each row and if we find some [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first one. We print it if it's not empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows. - - Then we visit each row again and check if it contains more than two cells. If yes, we take the text of the third one, and again, we print it if it's not empty. This way we correctly process the large table with its left and right part. + We visit each row and if we find some [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first and third ones. We print it if it's not empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows. ### Use CSS selectors to their max -Simplify your International Maritime Organization scraper from the previous exercise. Use one `for` loop and a single CSS selector that covers every member cell across the tables. +Simplify your International Maritime Organization scraper from the previous exercise. Use just one `for` loop with a single CSS selector that targets all relevant table cells. :::tip Need a nudge? 
From 33a719539bf936d2f803f563f73ee221dd9bbfd3 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 15 Jan 2026 09:54:09 +0100 Subject: [PATCH 11/19] fix: use PascalCase so that we do UnescoWhsCount, not UNESCOWHSCount --- .../scraping_basics_javascript/06_locating_elements.md | 8 ++++---- .../scraping_basics_javascript/09_getting_links.md | 4 ++-- .../scraping_basics_python/06_locating_elements.md | 8 ++++---- .../scraping_basics_python/09_getting_links.md | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md index 96f3cbb21c..fbd3fa1de9 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md @@ -8,8 +8,8 @@ slug: /scraping-basics-javascript/locating-elements import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import IMOCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs'; -import IMOCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/imo_countries_single_selector.mjs'; +import ImoCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs'; +import ImoCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/imo_countries_single_selector.mjs'; import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.mjs'; @@ -243,7 +243,7 @@ You may want to check out Cheerio's [`.eq()`](https://cheerio.js.org/docs/api/cl
Solution - {IMOCountriesExercise.code} + {ImoCountriesExercise.code} We visit each row and if we find some [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first and third ones. We print it if it's not empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows. @@ -264,7 +264,7 @@ You may want to check out the following pages:
Solution - {IMOCountriesSingleSelectorExercise.code} + {ImoCountriesSingleSelectorExercise.code}
### Scrape F1 news diff --git a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md index 17870239a1..ef39869521 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/getting-links import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import UNESCOLinksExercise from '!!raw-loader!roa-loader!./exercises/unesco_links.mjs'; +import UnescoLinksExercise from '!!raw-loader!roa-loader!./exercises/unesco_links.mjs'; import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.mjs'; @@ -344,7 +344,7 @@ https://www.unesco.org/en/countries/ad
Solution - {UNESCOLinksExercise.code} + {UnescoLinksExercise.code}
### Scrape links to F1 news diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md index 4599cf5a98..4bfd9f9d0a 100644 --- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md @@ -7,8 +7,8 @@ slug: /scraping-basics-python/locating-elements import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import IMOCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs'; -import IMOCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/imo_countries_single_selector.py'; +import ImoCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs'; +import ImoCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/imo_countries_single_selector.py'; import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.py'; **In this lesson we'll locate product data in the downloaded HTML. We'll use BeautifulSoup to find those HTML elements which contain details about each product, such as title or price.** @@ -242,7 +242,7 @@ Macao, China
Solution - {IMOCountriesExercise.code} + {ImoCountriesExercise.code} We visit each row and if we find some [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first and third ones. We print it if it's not empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows. @@ -263,7 +263,7 @@ You may want to check out the following pages:
Solution - {IMOCountriesSingleSelectorExercise.code} + {ImoCountriesSingleSelectorExercise.code}
### Scrape F1 news diff --git a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md index 94cee8fa0c..2d5d0a7a4e 100644 --- a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md @@ -7,7 +7,7 @@ slug: /scraping-basics-python/getting-links import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import UNESCOLinksExercise from '!!raw-loader!roa-loader!./exercises/unesco_links.py'; +import UnescoLinksExercise from '!!raw-loader!roa-loader!./exercises/unesco_links.py'; import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.py'; **In this lesson, we'll locate and extract links to individual product pages. We'll use BeautifulSoup to find the relevant bits of HTML.** @@ -347,7 +347,7 @@ https://www.unesco.org/en/countries/ad
Solution - {UNESCOLinksExercise.code} + {UnescoLinksExercise.code}
### Scrape links to F1 news From a4618ae40f19928711af301d02fa4268b351b322 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 15 Jan 2026 09:55:45 +0100 Subject: [PATCH 12/19] fix: bad extension in an import --- .../webscraping/scraping_basics_python/06_locating_elements.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md index 4bfd9f9d0a..74d1e1aefb 100644 --- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md @@ -7,7 +7,7 @@ slug: /scraping-basics-python/locating-elements import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import ImoCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs'; +import ImoCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.py'; import ImoCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/imo_countries_single_selector.py'; import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.py'; From c896843b07d4307eaa0c9ab0b3bb879e6b7005a3 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 15 Jan 2026 09:57:17 +0100 Subject: [PATCH 13/19] style: make linter happy --- .../scraping_basics_javascript/exercises/unesco_links.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs index 0cbf62e5f4..f3b2e15c2c 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs @@ -10,7 +10,7 @@ if (!response.ok) { const html = await response.text(); const $ = cheerio.load(html); -for (const element of $('.node--type-country').toArray()) {; +for (const element of $('.node--type-country').toArray()) { const $link = $(element).find('a').first(); const url = new URL($link.attr('href'), listingUrl).href; console.log(url); From 1c1767daac451300f81c39617a5783c825938fe4 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 15 Jan 2026 10:10:58 +0100 Subject: [PATCH 14/19] fix: limit UNESCO scraping to 10 countries not to DoS them --- .../scraping_basics_javascript/10_crawling.md | 2 +- .../exercises/unesco_whs_counts.mjs | 9 +++++---- .../webscraping/scraping_basics_python/10_crawling.md | 2 +- .../exercises/unesco_whs_counts.py | 11 ++++++----- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md index 6480744404..dc37ca9ca7 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md @@ -212,7 +212,7 @@ In the next lesson, we'll scrape the product detail pages so that each product v ### Scrape UNESCO World Heritage Sites -Scrape links to detail pages of all UNESCO members. Follow each link and extract the count of the World Heritage Sites. Print the URL and the number for each country. Start with this URL: +Scrape links to detail pages of all UNESCO members. 
Follow the first 10 links and extract the count of the World Heritage Sites. Print the URL and the number for each country. Start with this URL: ```text https://www.unesco.org/en/countries diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs index 5152824d80..18a87b3118 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs @@ -11,11 +11,11 @@ async function download(url) { function parseWhsCount($) { for (const element of $('.card-body').toArray()) { - const $card = $(element); - const title = $card.find('.card-title').text(); + const $infoCard = $(element); + const title = $infoCard.find('.card-title').text(); if (title.includes('World Heritage Sites')) { - const number = $card.find('.card-number').text().trim(); + const number = $infoCard.find('.card-number').text().trim(); return Number.parseInt(number, 10); } } @@ -24,8 +24,9 @@ function parseWhsCount($) { const listingUrl = 'https://www.unesco.org/en/countries'; const $listing = await download(listingUrl); +const countries = $listing('.node--type-country').toArray(); -for (const element of $listing('.node--type-country').toArray()) { +for (const element of countries.slice(0, 10)) { const $countryCard = $listing(element); const $link = $countryCard.find('a').first(); const href = $link.attr('href'); diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md b/sources/academy/webscraping/scraping_basics_python/10_crawling.md index 54f185e00d..f2dc6936f7 100644 --- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md @@ -185,7 +185,7 @@ In the next lesson, we'll scrape the product detail pages so that each product v ### Scrape UNESCO World Heritage Sites -Scrape links to detail pages of all UNESCO members. Follow each link and extract the count of the World Heritage Sites. Print the URL and the number for each country. Start with this URL: +Scrape links to detail pages of all UNESCO members. Follow the first 10 links and extract the count of the World Heritage Sites. Print the URL and the number for each country. 
Start with this URL: ```text https://www.unesco.org/en/countries diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py b/sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py index 53230a8c4a..b227507ef3 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py @@ -10,17 +10,18 @@ def download(url: str) -> BeautifulSoup: def parse_whc_count(soup: BeautifulSoup) -> int: - for card in soup.select(".card-body"): - card_title = card.select_one(".card-title").text - if "World Heritage Sites" in card_title: - return int(card.select_one(".card-number").text.strip()) + for info_card in soup.select(".card-body"): + title = info_card.select_one(".card-title").text + if "World Heritage Sites" in title: + return int(info_card.select_one(".card-number").text.strip()) return 0 listing_url = "https://www.unesco.org/en/countries" listing_soup = download(listing_url) +countries = listing_soup.select(".node--type-country") -for country in listing_soup.select(".node--type-country"): +for country in countries[:10]: link = country.select_one("a") if not link or 'href' not in link.attrs: continue From b2dee1b168e3c8f81ae3221703bcef009cd863ed Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 15 Jan 2026 11:44:50 +0100 Subject: [PATCH 15/19] fix: replace UNESCO with WTA, because UNESCO is super unreliable --- .../09_getting_links.md | 18 ++++---- .../scraping_basics_javascript/10_crawling.md | 26 +++++------- .../exercises/test.bats | 19 +++++---- .../exercises/unesco_links.mjs | 17 -------- .../exercises/unesco_whs_counts.mjs | 42 ------------------- .../exercises/wta_tennis_links.mjs | 17 ++++++++ .../exercises/wta_tennis_players.mjs | 30 +++++++++++++ .../09_getting_links.md | 18 ++++---- .../scraping_basics_python/10_crawling.md | 26 +++++------- .../exercises/test.bats | 19 +++++---- .../exercises/unesco_links.py | 15 ------- .../exercises/unesco_whs_counts.py | 32 -------------- .../exercises/wta_tennis_links.py | 13 ++++++ .../exercises/wta_tennis_players.py | 24 +++++++++++ 14 files changed, 142 insertions(+), 174 deletions(-) delete mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs delete mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_links.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_players.mjs delete mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/unesco_links.py delete mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_links.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_players.py diff --git a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md index ef39869521..7e1a15dfb8 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/getting-links import CodeBlock from '@theme/CodeBlock'; import 
LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import UnescoLinksExercise from '!!raw-loader!roa-loader!./exercises/unesco_links.mjs'; +import WtaTennisLinksExercise from '!!raw-loader!roa-loader!./exercises/wta_tennis_links.mjs'; import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.mjs'; @@ -324,27 +324,27 @@ Ta-da! We've managed to get links leading to the product pages. In the next less -### Scrape links to UNESCO members +### Scrape links to top tennis players -Download UNESCO's page with the list of its members, use Cheerio to parse it, and print links to detail pages of all the members. Start with this URL: +Download the WTA singles rankings page, use Cheerio to parse it, and print links to the detail pages of the listed players. Start with this URL: ```text -https://www.unesco.org/en/countries +https://www.wtatennis.com/rankings/singles ``` Your program should print the following: ```text -https://www.unesco.org/en/countries/af -https://www.unesco.org/en/countries/al -https://www.unesco.org/en/countries/dz -https://www.unesco.org/en/countries/ad +https://www.wtatennis.com/players/318310/iga-swiatek +https://www.wtatennis.com/players/322341/aryna-sabalenka +https://www.wtatennis.com/players/326911/coco-gauff +https://www.wtatennis.com/players/320203/elena-rybakina ... ```
Solution - {UnescoLinksExercise.code} + {WtaTennisLinksExercise.code}
### Scrape links to F1 news diff --git a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md index dc37ca9ca7..fc55568cf2 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/crawling import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import UnescoWhsCountsExercise from '!!raw-loader!roa-loader!./exercises/unesco_whs_counts.mjs'; +import WtaTennisPlayersExercise from '!!raw-loader!roa-loader!./exercises/wta_tennis_players.mjs'; import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.mjs'; @@ -210,33 +210,27 @@ In the next lesson, we'll scrape the product detail pages so that each product v -### Scrape UNESCO World Heritage Sites +### Scrape birthplaces of top 5 tennis players -Scrape links to detail pages of all UNESCO members. Follow the first 10 links and extract the count of the World Heritage Sites. Print the URL and the number for each country. Start with this URL: +Scrape links to detail pages of the top 5 tennis players according to WTA rankings. Follow the links and extract the birthplace of each player. Print the URL of the player's detail page, then `|` as a separator, then the birthplace. Start with this URL: ```text -https://www.unesco.org/en/countries +https://www.wtatennis.com/rankings/singles ``` Your program should print the following: ```text -https://www.unesco.org/en/countries/af 2 -https://www.unesco.org/en/countries/al 4 -https://www.unesco.org/en/countries/dz 7 -https://www.unesco.org/en/countries/ad 1 -... +https://www.wtatennis.com/players/320760/aryna-sabalenka | Minsk, Belarus +https://www.wtatennis.com/players/326408/iga-swiatek | Warsaw, Poland +https://www.wtatennis.com/players/328560/coco-gauff | Delray Beach, Fl. USA +https://www.wtatennis.com/players/326384/amanda-anisimova | Miami Beach, FL, USA +https://www.wtatennis.com/players/324166/elena-rybakina | Moscow, Russia ``` -:::tip Need a nudge? - -Locating cells in tables is sometimes easier if you know how to [filter](https://cheerio.js.org/docs/api/classes/Cheerio#filter) or [navigate up](https://cheerio.js.org/docs/api/classes/Cheerio#parent) in the HTML element tree. - -::: -
Solution - {UnescoWhsCountsExercise.code} + {WtaTennisPlayersExercise.code}
### Scrape authors of F1 news articles diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index 5841d6fb11..b67016c70e 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -86,11 +86,11 @@ teardown_file() { [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]] } -@test "lists UNESCO member links" { - run node unesco_links.mjs +@test "lists WTA player links" { + run node wta_tennis_links.mjs - [[ "$output" == *$'https://www.unesco.org/en/countries/af\nhttps://www.unesco.org/en/countries/al\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *'https://www.wtatennis.com/players/'* ]] + [[ $(echo "$output" | wc -l) -gt 10 ]] } @test "lists Guardian F1 article links" { @@ -100,12 +100,13 @@ teardown_file() { [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "prints counts of UNESCO WHS" { - run node unesco_whs_counts.mjs +@test "lists WTA player birthplaces" { + run node wta_tennis_players.mjs - [[ "$output" == *$'https://www.unesco.org/en/countries/af 2\n'* ]] - [[ "$output" == *$'https://www.unesco.org/en/countries/bs 0\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *'https://www.wtatennis.com/players/'* ]] + [[ "$output" == *' | '* ]] + [[ "$output" == *', '* ]] + [[ $(echo "$output" | wc -l) -eq 5 ]] } @test "lists Guardian F1 authors" { diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs deleted file mode 100644 index f3b2e15c2c..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_links.mjs +++ /dev/null @@ -1,17 +0,0 @@ -import * as cheerio from 'cheerio'; - -const listingUrl = 'https://www.unesco.org/en/countries'; -const response = await fetch(listingUrl); - -if (!response.ok) { - throw new Error(`HTTP ${response.status}`); -} - -const html = await response.text(); -const $ = cheerio.load(html); - -for (const element of $('.node--type-country').toArray()) { - const $link = $(element).find('a').first(); - const url = new URL($link.attr('href'), listingUrl).href; - console.log(url); -} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs deleted file mode 100644 index 18a87b3118..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/unesco_whs_counts.mjs +++ /dev/null @@ -1,42 +0,0 @@ -import * as cheerio from 'cheerio'; - -async function download(url) { - const response = await fetch(url); - if (!response.ok) { - throw new Error(`HTTP ${response.status}`); - } - const html = await response.text(); - return cheerio.load(html); -} - -function parseWhsCount($) { - for (const element of $('.card-body').toArray()) { - const $infoCard = $(element); - const title = $infoCard.find('.card-title').text(); - - if (title.includes('World Heritage Sites')) { - const number = $infoCard.find('.card-number').text().trim(); - return Number.parseInt(number, 10); - } - } - return 0; -} - -const listingUrl = 'https://www.unesco.org/en/countries'; -const $listing = await download(listingUrl); -const countries = $listing('.node--type-country').toArray(); - -for (const element of countries.slice(0, 10)) { 
- const $countryCard = $listing(element); - const $link = $countryCard.find('a').first(); - const href = $link.attr('href'); - - if (!href) { - continue; - } - - const countryUrl = new URL(href, listingUrl).href; - const $country = await download(countryUrl); - const whsCount = parseWhsCount($country); - console.log(`${countryUrl} ${whsCount}`); -} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_links.mjs new file mode 100644 index 0000000000..ac075f4cb0 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_links.mjs @@ -0,0 +1,17 @@ +import * as cheerio from 'cheerio'; + +const listingUrl = 'https://www.wtatennis.com/rankings/singles'; +const response = await fetch(listingUrl); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('.rankings__list .player-row-drawer__link').toArray()) { + const playerUrlRelative = $(element).attr('href'); + const playerUrl = new URL(playerUrlRelative, listingUrl).href; + console.log(playerUrl); +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_players.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_players.mjs new file mode 100644 index 0000000000..3ed455e2bc --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_players.mjs @@ -0,0 +1,30 @@ +import * as cheerio from 'cheerio'; + +async function download(url) { + const response = await fetch(url); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const html = await response.text(); + return cheerio.load(html); +} + +const listingUrl = 'https://www.wtatennis.com/rankings/singles'; +const $listing = await download(listingUrl); +const playerLinks = $listing('.rankings__list .player-row-drawer__link').toArray(); + +for (const element of playerLinks.slice(0, 5)) { + const playerUrlRelative = $listing(element).attr('href'); + const playerUrl = new URL(playerUrlRelative, listingUrl).href; + const $player = await download(playerUrl); + + for (const infoBlock of $player('.profile-bio__info-block').toArray()) { + const $infoBlock = $player(infoBlock); + const label = $infoBlock.find('h2').text().trim().toLowerCase(); + if (label === 'birthplace') { + const birthplace = $infoBlock.find('span').text().trim(); + console.log(`${playerUrl} | ${birthplace}`); + break; + } + } +} diff --git a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md index 2d5d0a7a4e..5e767dfcd7 100644 --- a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md @@ -7,7 +7,7 @@ slug: /scraping-basics-python/getting-links import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import UnescoLinksExercise from '!!raw-loader!roa-loader!./exercises/unesco_links.py'; +import WtaTennisLinksExercise from '!!raw-loader!roa-loader!./exercises/wta_tennis_links.py'; import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.py'; **In this lesson, we'll locate and extract links to individual product pages. 
We'll use BeautifulSoup to find the relevant bits of HTML.** @@ -327,27 +327,27 @@ Ta-da! We've managed to get links leading to the product pages. In the next less -### Scrape links to UNESCO members +### Scrape links to top tennis players -Download UNESCO's page with the list of its members, use Beautiful Soup to parse it, and print links to detail pages of all the members. Start with this URL: +Download the WTA singles rankings page, use Beautiful Soup to parse it, and print links to the detail pages of the listed players. Start with this URL: ```text -https://www.unesco.org/en/countries +https://www.wtatennis.com/rankings/singles ``` Your program should print the following: ```text -https://www.unesco.org/en/countries/af -https://www.unesco.org/en/countries/al -https://www.unesco.org/en/countries/dz -https://www.unesco.org/en/countries/ad +https://www.wtatennis.com/players/318310/iga-swiatek +https://www.wtatennis.com/players/322341/aryna-sabalenka +https://www.wtatennis.com/players/326911/coco-gauff +https://www.wtatennis.com/players/320203/elena-rybakina ... ```
Solution - {UnescoLinksExercise.code} + {WtaTennisLinksExercise.code}
### Scrape links to F1 news diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md b/sources/academy/webscraping/scraping_basics_python/10_crawling.md index f2dc6936f7..f2af6190b9 100644 --- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md @@ -7,7 +7,7 @@ slug: /scraping-basics-python/crawling import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import UnescoWhsCountsExercise from '!!raw-loader!roa-loader!./exercises/unesco_whs_counts.py'; +import WtaTennisPlayersExercise from '!!raw-loader!roa-loader!./exercises/wta_tennis_players.py'; import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.py'; **In this lesson, we'll follow links to individual product pages. We'll use HTTPX to download them and BeautifulSoup to process them.** @@ -183,33 +183,27 @@ In the next lesson, we'll scrape the product detail pages so that each product v -### Scrape UNESCO World Heritage Sites +### Scrape birthplaces of top 5 tennis players -Scrape links to detail pages of all UNESCO members. Follow the first 10 links and extract the count of the World Heritage Sites. Print the URL and the number for each country. Start with this URL: +Scrape links to detail pages of the top 5 tennis players according to WTA rankings. Follow the links and extract the birthplace of each player. Print the URL of the player's detail page, then `|` as a separator, then the birthplace. Start with this URL: ```text -https://www.unesco.org/en/countries +https://www.wtatennis.com/rankings/singles ``` Your program should print the following: ```text -https://www.unesco.org/en/countries/af 2 -https://www.unesco.org/en/countries/al 4 -https://www.unesco.org/en/countries/dz 7 -https://www.unesco.org/en/countries/ad 1 -... +https://www.wtatennis.com/players/320760/aryna-sabalenka | Minsk, Belarus +https://www.wtatennis.com/players/326408/iga-swiatek | Warsaw, Poland +https://www.wtatennis.com/players/328560/coco-gauff | Delray Beach, Fl. USA +https://www.wtatennis.com/players/326384/amanda-anisimova | Miami Beach, FL, USA +https://www.wtatennis.com/players/324166/elena-rybakina | Moscow, Russia ``` -:::tip Need a nudge? - -Locating cells in tables is sometimes easier if you know how to [navigate up](https://beautiful-soup-4.readthedocs.io/en/latest/index.html#going-up) in the HTML element soup. - -::: -
Solution - {UnescoWhsCountsExercise.code} + {WtaTennisPlayersExercise.code}
### Scrape authors of F1 news articles diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index a1da282e21..be2cf6c3d0 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -79,11 +79,11 @@ teardown() { [[ "$output" == "{'title': 'Premium Speakers', 'min_price': 75000, 'price': 75000}" ]] } -@test "lists UNESCO member links" { - run uv run --with=httpx --with=beautifulsoup4 python unesco_links.py +@test "lists WTA player links" { + run uv run --with=httpx --with=beautifulsoup4 python wta_tennis_links.py - [[ "$output" == *$'https://www.unesco.org/en/countries/af\nhttps://www.unesco.org/en/countries/al\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *'https://www.wtatennis.com/players/'* ]] + [[ $(echo "$output" | wc -l) -gt 10 ]] } @test "lists Guardian F1 article links" { @@ -93,12 +93,13 @@ teardown() { [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "prints counts of UNESCO WHS" { - run uv run --with=httpx --with=beautifulsoup4 python unesco_whs_counts.py +@test "lists WTA player birthplaces" { + run uv run --with=httpx --with=beautifulsoup4 python wta_tennis_players.py - [[ "$output" == *$'https://www.unesco.org/en/countries/af 2\n'* ]] - [[ "$output" == *$'https://www.unesco.org/en/countries/bs 0\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *'https://www.wtatennis.com/players/'* ]] + [[ "$output" == *' | '* ]] + [[ "$output" == *', '* ]] + [[ $(echo "$output" | wc -l) -eq 5 ]] } @test "lists Guardian F1 authors" { diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/unesco_links.py b/sources/academy/webscraping/scraping_basics_python/exercises/unesco_links.py deleted file mode 100644 index a2c80b7f5d..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/unesco_links.py +++ /dev/null @@ -1,15 +0,0 @@ -import httpx -from bs4 import BeautifulSoup -from urllib.parse import urljoin - -listing_url = "https://www.unesco.org/en/countries" -response = httpx.get(listing_url) -response.raise_for_status() - -soup = BeautifulSoup(response.text, "html.parser") - -for country in soup.select(".node--type-country"): - link = country.select_one("a") - if link and 'href' in link.attrs: - url = urljoin(listing_url, link["href"]) - print(url) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py b/sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py deleted file mode 100644 index b227507ef3..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/unesco_whs_counts.py +++ /dev/null @@ -1,32 +0,0 @@ -import httpx -from bs4 import BeautifulSoup -from urllib.parse import urljoin - - -def download(url: str) -> BeautifulSoup: - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - -def parse_whc_count(soup: BeautifulSoup) -> int: - for info_card in soup.select(".card-body"): - title = info_card.select_one(".card-title").text - if "World Heritage Sites" in title: - return int(info_card.select_one(".card-number").text.strip()) - return 0 - - -listing_url = "https://www.unesco.org/en/countries" -listing_soup = download(listing_url) -countries = listing_soup.select(".node--type-country") - -for country in countries[:10]: - link = country.select_one("a") - if not link or 
'href' not in link.attrs: - continue - - country_url = urljoin(listing_url, link["href"]) - country_soup = download(country_url) - whs_count = parse_whc_count(country_soup) - print(country_url, whs_count) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_links.py b/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_links.py new file mode 100644 index 0000000000..e68c809f76 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_links.py @@ -0,0 +1,13 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + +listing_url = "https://www.wtatennis.com/rankings/singles" +response = httpx.get(listing_url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for link in soup.select(".rankings__list .player-row-drawer__link"): + player_url = urljoin(listing_url, link["href"]) + print(player_url) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_players.py b/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_players.py new file mode 100644 index 0000000000..c98b2a5fa5 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_players.py @@ -0,0 +1,24 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +def download(url: str) -> BeautifulSoup: + response = httpx.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, "html.parser") + + +listing_url = "https://www.wtatennis.com/rankings/singles" +listing_soup = download(listing_url) +player_links = listing_soup.select(".rankings__list .player-row-drawer__link") + +for link in player_links[:5]: + player_url = urljoin(listing_url, link["href"]) + player_soup = download(player_url) + + for info_block in player_soup.select(".profile-bio__info-block"): + label_text = info_block.select_one("h2").text.strip() + if label_text.lower() == "birthplace": + birthplace = info_block.select_one("span").text.strip() + print(player_url, "|", birthplace) From 404760c24e1a9181be22d43eafcb2a2a82ff94c3 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 15 Jan 2026 15:02:50 +0100 Subject: [PATCH 16/19] fix: modify Netflix/IMDb exercise so that the tests pass Discovered https://github.com/apify/crawlee-python/issues/1673 when working on this. --- .../12_framework.md | 2 +- .../exercises/crawlee_netflix_ratings.mjs | 5 +-- .../exercises/test.bats | 2 +- .../scraping_basics_python/12_framework.md | 2 +- .../exercises/crawlee_netflix_ratings.py | 8 +++-- .../exercises/test.bats | 36 +++++++++---------- 6 files changed, 30 insertions(+), 25 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md index b2e86624b1..8c9a687473 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md @@ -429,7 +429,7 @@ If you export the dataset as JSON, it should look something like this: ### Use Crawlee to find the ratings of the most popular Netflix films -The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. 
Each item you push to Crawlee's default dataset should include the following data: +The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data: - URL of the film's IMDb page - Title diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs index 19da811bc3..6707280ee8 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs @@ -15,9 +15,10 @@ const crawler = new CheerioCrawler({ }); } } else if (request.label === 'IMDB_SEARCH') { - await enqueueLinks({ selector: '.find-result-item a', label: 'IMDB', limit: 1 }); + await enqueueLinks({ selector: '.ipc-title-link-wrapper', label: 'IMDB', limit: 1 }); } else { - const requests = $("[data-uia='top10-table-row-title'] button").toArray().map((buttonElement) => { + const buttons = $("[data-uia='top10-table-row-title'] button").toArray().slice(0, 5); + const requests = buttons.map((buttonElement) => { const name = $(buttonElement).text().trim(); const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`; return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' }); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index b67016c70e..ba059d1561 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -146,7 +146,7 @@ teardown_file() { (( status == 0 )) [[ -f dataset.json ]] - [[ $(cat dataset.json | jq '. | length') == "10" ]] + [[ $(cat dataset.json | jq '. | length') == "5" ]] [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md index afa8736655..01f1095c17 100644 --- a/sources/academy/webscraping/scraping_basics_python/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md @@ -470,7 +470,7 @@ If you export the dataset as JSON, it should look something like this: ### Use Crawlee to find the ratings of the most popular Netflix films -The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data: +The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. 
Each item you push to Crawlee's default dataset should include the following data: - URL of the film's IMDb page - Title diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py index b7f2000b37..595932596c 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py @@ -11,7 +11,8 @@ async def main() -> None: @crawler.router.default_handler async def handle_netflix_table(context: BeautifulSoupCrawlingContext) -> None: requests: list[Request] = [] - for name_cell in context.soup.select('[data-uia="top10-table-row-title"] button'): + name_cells = context.soup.select('[data-uia="top10-table-row-title"] button') + for name_cell in name_cells[:5]: name = name_cell.text.strip() imdb_search_url = ( f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft" @@ -21,10 +22,13 @@ async def handle_netflix_table(context: BeautifulSoupCrawlingContext) -> None: @crawler.router.handler("IMDB_SEARCH") async def handle_imdb_search(context: BeautifulSoupCrawlingContext) -> None: - await context.enqueue_links(selector=".find-result-item a", label="IMDB", limit=1) + await context.enqueue_links( + selector=".ipc-title-link-wrapper", label="IMDB", limit=1 + ) @crawler.router.handler("IMDB") async def handle_imdb(context: BeautifulSoupCrawlingContext) -> None: + print(f"Processing IMDB page: {context.request.url}") rating_element = context.soup.select_one( "[data-testid='hero-rating-bar__aggregate-rating__score']" ) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index be2cf6c3d0..20ea6914a5 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -7,25 +7,25 @@ teardown() { } @test "outputs the HTML with Star Wars products" { - run uv run --with=httpx python lego.py + run uv run -q --with=httpx python lego.py [[ "$output" == *"Millennium Falcon"* ]] } @test "counts the number of F1 Academy teams" { - run uv run --with=httpx --with=beautifulsoup4 python f1academy_teams.py + run uv run -q --with=httpx --with=beautifulsoup4 python f1academy_teams.py [[ "$output" == "6" ]] } @test "counts the number of F1 Academy drivers" { - run uv run --with=httpx --with=beautifulsoup4 python f1academy_drivers.py + run uv run -q --with=httpx --with=beautifulsoup4 python f1academy_drivers.py [[ "$output" == "18" ]] } @test "lists IMO countries" { - run uv run --with=httpx --with=beautifulsoup4 python imo_countries.py + run uv run -q --with=httpx --with=beautifulsoup4 python imo_countries.py [[ "$output" == *$'Albania\nLibya\n'* ]] [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] @@ -33,7 +33,7 @@ teardown() { } @test "lists IMO countries with a single selector" { - run uv run --with=httpx --with=beautifulsoup4 python imo_countries_single_selector.py + run uv run -q --with=httpx --with=beautifulsoup4 python imo_countries_single_selector.py [[ "$output" == *$'Albania\nLibya\n'* ]] [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] @@ -41,14 +41,14 @@ teardown() { } @test "lists Guardian F1 article titles" { - run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_titles.py + run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_titles.py 
[[ "$output" == *' F1 '* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints warehouse stock counts" { - run uv run --with=httpx --with=beautifulsoup4 python warehouse_units.py + run uv run -q --with=httpx --with=beautifulsoup4 python warehouse_units.py [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 76\n'* ]] @@ -56,7 +56,7 @@ teardown() { } @test "prints warehouse stock counts using regex" { - run uv run --with=httpx --with=beautifulsoup4 python warehouse_units_regex.py + run uv run -q --with=httpx --with=beautifulsoup4 python warehouse_units_regex.py [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 76\n'* ]] @@ -64,7 +64,7 @@ teardown() { } @test "prints Guardian F1 titles with publish dates" { - run uv run --with=httpx --with=beautifulsoup4 python guardian_publish_dates.py + run uv run -q --with=httpx --with=beautifulsoup4 python guardian_publish_dates.py [[ "$output" == *' F1 '* ]] [[ "$output" == *' | Mon '* ]] # has info about date, Mondays are very likely @@ -80,21 +80,21 @@ teardown() { } @test "lists WTA player links" { - run uv run --with=httpx --with=beautifulsoup4 python wta_tennis_links.py + run uv run -q --with=httpx --with=beautifulsoup4 python wta_tennis_links.py [[ "$output" == *'https://www.wtatennis.com/players/'* ]] [[ $(echo "$output" | wc -l) -gt 10 ]] } @test "lists Guardian F1 article links" { - run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_links.py + run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_links.py [[ "$output" == *'https://www.theguardian.com/sport/'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists WTA player birthplaces" { - run uv run --with=httpx --with=beautifulsoup4 python wta_tennis_players.py + run uv run -q --with=httpx --with=beautifulsoup4 python wta_tennis_players.py [[ "$output" == *'https://www.wtatennis.com/players/'* ]] [[ "$output" == *' | '* ]] @@ -103,7 +103,7 @@ teardown() { } @test "lists Guardian F1 authors" { - run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py + run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py [[ "$output" == *' F1 '* ]] [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him) @@ -112,7 +112,7 @@ teardown() { } @test "lists Python database jobs" { - run uv run --with=httpx --with=beautifulsoup4 python python_jobs_database.py + run uv run -q --with=httpx --with=beautifulsoup4 python python_jobs_database.py [[ "$output" == *"'title': '"* ]] [[ "$output" == *"'company': '"* ]] @@ -121,13 +121,13 @@ teardown() { } @test "finds the shortest CNN sports article" { - run uv run --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py + run uv run -q --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py [[ "$output" == 'https://edition.cnn.com/'* ]] } @test "scrapes F1 Academy driver details with Crawlee" { - run uv run --with=crawlee[beautifulsoup] python crawlee_f1_drivers.py + run uv run -q --with=crawlee[beautifulsoup] python crawlee_f1_drivers.py (( status == 0 )) [[ -f dataset.json ]] @@ -137,11 +137,11 @@ teardown() { } @test "scrapes Netflix ratings with Crawlee" { - run uv run --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py + run uv run -q --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py (( status == 0 )) [[ -f 
dataset.json ]] - [[ $(cat dataset.json | jq '. | length') == "10" ]] + [[ $(cat dataset.json | jq '. | length') -gt 5 ]] # should be -eq 5, but there is a bug https://github.com/apify/crawlee-python/issues/1673 [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] } From c01c0a80ad01da78b4375a119070d25c14eac0a3 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 15 Jan 2026 15:39:37 +0100 Subject: [PATCH 17/19] fix: don't scrape the npm registry, as it became highly protected --- .../11_scraping_variants.md | 47 +++++++------- .../exercises/js_llm_projects.mjs | 33 ++++++++++ .../exercises/npm_llm_packages.mjs | 61 ------------------- .../exercises/test.bats | 12 +++- 4 files changed, 64 insertions(+), 89 deletions(-) create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs delete mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs diff --git a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md index 3a85eec446..5c256f17ae 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/scraping-variants import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import NpmLlmPackagesExercise from '!!raw-loader!roa-loader!./exercises/npm_llm_packages.mjs'; +import JsLlmProjectsExercise from '!!raw-loader!roa-loader!./exercises/js_llm_projects.mjs'; import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.mjs'; @@ -347,38 +347,38 @@ Is this the end? Maybe! In the next lesson, we'll use a scraping framework to bu -### Build a scraper for watching npm packages +### Build a scraper for watching JavaScript projects -You can build a scraper now, can't you? Let's build another one! From the registry at [npmjs.com](https://www.npmjs.com/), scrape information about npm packages that match the following criteria: +You can build a scraper now, can't you? Let's build another one! From the [GitHub Topics](https://github.com/topics/) page, scrape information about projects that match the following criteria: -- Have the keyword "LLM" (as in _large language model_) -- Updated within the last two years ("2 years ago" is okay; "3 years ago" is too old) +- Have the topic "LLM" (as in _large language model_) +- Updated within the last month (at most 30 days ago) -Print an array of the top 5 packages with the most dependents. Each package should be represented by an object containing the following data: +Print an array of the top 5 projects with the most stars. 
Each project should be represented by an object containing the following data: - Name - Description -- URL to the package detail page -- Number of dependents -- Number of downloads +- URL to the repository page +- Number of stars +- Date it was updated on Your output should look something like this: ```js [ { - name: 'langchain', - url: 'https://www.npmjs.com/package/langchain', - description: 'Typescript bindings for langchain', - dependents: 735, - downloads: 3938 + name: 'anything-llm', + url: 'https://github.com/Mintplex-Labs/anything-llm', + description: 'The all-in-one Desktop & Docker AI application with built-in RAG, AI agents, No-code agent builder, MCP compatibility, and more.', + stars: 53358, + updatedOn: "2026-01-15" }, { - name: '@langchain/core', - url: 'https://www.npmjs.com/package/@langchain/core', - description: 'Core LangChain.js abstractions and schemas', - dependents: 730, - downloads: 5994 + name: 'SillyTavern', + url: 'https://github.com/SillyTavern/SillyTavern', + description: 'LLM Frontend for Power Users.', + stars: 22054, + updatedOn: "2026-01-15" }, ... ] @@ -387,14 +387,11 @@ Your output should look something like this:
Solution

  After inspecting the page, you'll notice that projects with the "LLM" topic have a dedicated URL. Also, changing the language and sorting dropdowns results in a page with its own URL. We'll use that as our starting point, which saves us from having to scrape all of GitHub Topics and then filter by topic or sort by the number of stars.

-  {NpmLlmPackagesExercise.code}
-
-  Since the HTML doesn't contain any descriptive classes, we must rely on its structure. We're using [`.children()`](https://cheerio.js.org/docs/api/classes/Cheerio#children) to carefully navigate the HTML element tree.
-
-  For items older than 2 years, we return `null` instead of an item. Before printing the results, we use [.filter()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/filter) to remove these empty values and [.splice()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/splice) the array down to just 5 items.
+  Both the exact number of stars and the `updatedOn` date can be figured out from hidden attributes of some of the HTML elements, so we can avoid any additional requests.

+  {JsLlmProjectsExercise.code}
 </details>
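In isolation, reading those hidden attributes with Cheerio might look like the following sketch. The selectors are the ones used in the solution file; the exact wording of GitHub's `aria-label` is an assumption, which is why only its leading number is parsed:

```js
import * as cheerio from 'cheerio';

const listingUrl = 'https://github.com/topics/llm?l=javascript&s=stars';
const response = await fetch(listingUrl);
if (!response.ok) {
  throw new Error(`HTTP ${response.status}`);
}
const $ = cheerio.load(await response.text());

// look at the first repository card only
const $card = $('article').first();

// e.g. aria-label="53358 users starred this repository" (assumed wording), number first
const starsLabel = $card.find('#repo-stars-counter-star').first().attr('aria-label');
const stars = Number.parseInt(starsLabel.split(' ')[0], 10);

// <relative-time datetime="2026-01-15T09:12:34Z"> carries the full timestamp
const updatedOn = $card.find('relative-time').attr('datetime').split('T')[0];

console.log({ stars, updatedOn });
```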
 
 ### Find the shortest CNN article which made it to the Sports homepage
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs
new file mode 100644
index 0000000000..0eec016033
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs
@@ -0,0 +1,33 @@
+import * as cheerio from 'cheerio';
+
+async function download(url) {
+  const response = await fetch(url);
+  if (!response.ok) {
+    throw new Error(`HTTP ${response.status}`);
+  }
+  const html = await response.text();
+  return cheerio.load(html);
+}
+
+const listingUrl = 'https://github.com/topics/llm?l=javascript&s=stars';
+const $ = await download(listingUrl);
+
+const promises = $('article').toArray().map(async (element) => {
+  const $card = $(element);
+  const $link = $card.find('h3 a:nth-child(1)').first();
+
+  const url = new URL($link.attr('href'), listingUrl).href;
+  const name = $link.text().trim();
+  const description = $card.find('p').text().trim();
+
+  const starsText = $card.find('#repo-stars-counter-star').first().attr("aria-label");
+  const stars = parseInt(starsText.split(' ')[0], 10);
+
+  const updatedAt = $card.find('relative-time').attr('datetime');
+  const updatedOn = updatedAt.split('T')[0];
+
+  return { name, url, description, stars, updatedOn };
+});
+
+const data = (await Promise.all(promises)).filter((item) => item);
+console.log(data.slice(0, 5));
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs
deleted file mode 100644
index f52a885057..0000000000
--- a/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs
+++ /dev/null
@@ -1,61 +0,0 @@
-import * as cheerio from 'cheerio';
-
-async function download(url) {
-  const response = await fetch(url);
-  if (!response.ok) {
-    throw new Error(`HTTP ${response.status}`);
-  }
-  const html = await response.text();
-  return cheerio.load(html);
-}
-
-function parseNumber(text) {
-  return Number.parseInt(text.replace(/[^0-9]/g, ''), 10);
-}
-
-const listingUrl = 'https://www.npmjs.com/search?page=0&q=keywords%3Allm&sortBy=dependent_count';
-const $ = await download(listingUrl);
-
-const promises = $('section').toArray().map(async (element) => {
-  const $card = $(element);
-  const $link = $card.find('a').first();
-  if (!$link.length) {
-    return null;
-  }
-
-  const details = $card
-    .children()
-    .first()
-    .children()
-    .last()
-    .text()
-    .split('•')
-    .map((item) => item.trim());
-
-  const updatedText = details[2] ?? '';
-  const dependentsText = details[3] ?? '';
-  const dependents = parseNumber(dependentsText);
-
-  if (updatedText.includes('years ago')) {
-    const yearsAgo = parseNumber(updatedText);
-    if (Number.isFinite(yearsAgo) && yearsAgo > 2) {
-      return null;
-    }
-  }
-
-  const name = $link.text().trim();
-  const url = new URL($link.attr('href'), listingUrl).href;
-  const description = $card.find('p').text().trim();
-
-  const downloadsText = $card
-    .children()
-    .last()
-    .text()
-    .trim();
-  const downloads = parseNumber(downloadsText);
-
-  return { name, url, description, dependents, downloads };
-});
-
-const data = (await Promise.all(promises)).filter((item) => item);
-console.log(data.slice(0, 5));
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats
index ba059d1561..0deec70fec 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats
@@ -118,11 +118,17 @@ teardown_file() {
     [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
-@test "lists npm LLM packages" {
-    run node npm_llm_packages.mjs
-
-    (( status == 0 ))
-    [[ -n "$output" ]]
+@test "lists JavaScript GitHub repos with the LLM topic" {
+    run node js_llm_projects.mjs
+
+    (( status == 0 ))
+    [[ $(echo "$output" | wc -l) -eq 37 ]]
+    [[ "$output" == *' name: '* ]]
+    [[ "$output" == *' url: '* ]]
+    [[ "$output" == *'https://github.com/'* ]]
+    [[ "$output" == *' description: '* ]]
+    [[ "$output" == *' stars: '* ]]
+    [[ "$output" == *' updatedOn: '* ]]
 }
 
 @test "finds the shortest CNN sports article" {

From d1cc4113eb5daf1a42648e444ba411e9da3c9247 Mon Sep 17 00:00:00 2001
From: Honza Javorek
Date: Thu, 15 Jan 2026 15:54:25 +0100
Subject: [PATCH 18/19] style: make linter happy

---
 .../scraping_basics_javascript/exercises/js_llm_projects.mjs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs
index 0eec016033..f74e5b533d 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs
@@ -20,7 +20,7 @@ const promises = $('article').toArray().map(async (element) => {
   const name = $link.text().trim();
   const description = $card.find('p').text().trim();
 
-  const starsText = $card.find('#repo-stars-counter-star').first().attr("aria-label");
+  const starsText = $card.find('#repo-stars-counter-star').first().attr('aria-label');
   const stars = parseInt(starsText.split(' ')[0], 10);
 
   const updatedAt = $card.find('relative-time').attr('datetime');
   const updatedOn = updatedAt.split('T')[0];

From 37a06efd4bcbd66964aad6f4c0150b5e62b73c22 Mon Sep 17 00:00:00 2001
From: Honza Javorek
Date: Thu, 15 Jan 2026 20:38:41 +0100
Subject: [PATCH 19/19] fix: remove leftover line

---
 .../scraping_basics_python/exercises/crawlee_netflix_ratings.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py
index 595932596c..31449cd7fb 100644
--- a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py
@@ -28,7 +28,6 @@ async def handle_imdb_search(context: BeautifulSoupCrawlingContext) -> None:
 
 @crawler.router.handler("IMDB")
 async def handle_imdb(context: BeautifulSoupCrawlingContext) -> None:
-    print(f"Processing IMDB page: {context.request.url}")
     rating_element = context.soup.select_one(
         "[data-testid='hero-rating-bar__aggregate-rating__score']"
    )
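
A note on the "hidden attributes" the new GitHub Topics solution relies on: the exact star count lives in the `aria-label` of the stars counter, and the last-update date in the `datetime` attribute of the `<relative-time>` element. The sketch below is a minimal, self-contained illustration of that idea against a simplified stand-in for GitHub's topic-page markup; the element structure and attribute values here are illustrative assumptions, not the live page.

```js
// Minimal sketch (not part of the patches above): reading hidden attributes
// with Cheerio. The HTML fragment is a simplified stand-in for GitHub's
// topic-page markup, so the repo name and values are illustrative only.
import * as cheerio from 'cheerio';

const html = `
  <article>
    <h3><a href="/langchain-ai/langchainjs">langchainjs</a></h3>
    <p>Build context-aware reasoning applications.</p>
    <span id="repo-stars-counter-star" aria-label="12345 stars">12.3k</span>
    <relative-time datetime="2026-01-10T08:15:00Z">Jan 10, 2026</relative-time>
  </article>
`;

const $ = cheerio.load(html);
const $card = $('article').first();

// The visible text says "12.3k", but the aria-label carries the exact count.
const starsText = $card.find('#repo-stars-counter-star').attr('aria-label');
const stars = parseInt(starsText.split(' ')[0], 10);

// <relative-time> renders a human-friendly date, but its datetime attribute
// holds a machine-readable ISO timestamp we can split on "T".
const updatedOn = $card.find('relative-time').attr('datetime').split('T')[0];

console.log({ stars, updatedOn }); // { stars: 12345, updatedOn: '2026-01-10' }
```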