diff --git a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md index d6666fcbd1..fbd3fa1de9 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md @@ -8,8 +8,8 @@ slug: /scraping-basics-javascript/locating-elements import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCountriesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries.mjs'; -import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.mjs'; +import ImoCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs'; +import ImoCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/imo_countries_single_selector.mjs'; import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.mjs'; @@ -212,45 +212,46 @@ Great! We have managed to use CSS selectors and walk the HTML tree to get a list -### Scrape Wikipedia +### Scrape list of International Maritime Organization members -Download Wikipedia's page with the list of African countries, use Cheerio to parse it, and print short English names of all the states and territories mentioned in all tables. This is the URL: +Download International Maritime Organization's page with the list of members, use Cheerio to parse it, and print names of all the members mentioned in all tables (including Associate Members). This is the URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx ``` Your program should print the following: ```text +Albania +Libya Algeria -Angola -Benin -Botswana -Burkina Faso -Burundi -Cameroon -Cape Verde -Central African Republic -Chad -Comoros -Democratic Republic of the Congo -Republic of the Congo -Djibouti +Lithuania ... +Liberia +Zimbabwe +Faroes +Hong Kong, China +Macao, China ``` +:::tip Need a nudge? + +You may want to check out Cheerio's [`.eq()`](https://cheerio.js.org/docs/api/classes/Cheerio#eq). + +::: +
Solution

-   {WikipediaCountriesExercise.code}
+   {ImoCountriesExercise.code}

-   Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells.
+   We visit each row, and if it contains [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first and the third cell and print each value that isn't empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows.

</details>
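If `.eq()` is new to you, here's a minimal, self-contained sketch of how it behaves — the HTML string and its values are made up for illustration:

```js
import * as cheerio from 'cheerio';

// A made-up table row, just to demonstrate .eq()
const $ = cheerio.load('<table><tr><td>Albania</td><td>…</td><td>Libya</td></tr></table>');

const $cells = $('td');
console.log($cells.eq(0).text()); // Albania
console.log($cells.eq(2).text()); // Libya
console.log($cells.eq(9).text()); // '' — an out-of-range index yields an empty selection
```

Because an out-of-range index returns an empty selection rather than throwing, checking the trimmed text for emptiness is enough to skip short or header-only rows.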
### Use CSS selectors to their max -Simplify the code from previous exercise. Use a single for loop and a single CSS selector. +Simplify your International Maritime Organization scraper from the previous exercise. Use just one `for` loop with a single CSS selector that targets all relevant table cells. :::tip Need a nudge? @@ -263,7 +264,7 @@ You may want to check out the following pages:
Solution - {WikipediaCountriesSingleSelectorExercise.code} + {ImoCountriesSingleSelectorExercise.code}
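To see why `:nth-child(odd)` picks up both the first and the third cell, you can experiment with a tiny made-up table — the markup below only loosely mimics the real page:

```js
import * as cheerio from 'cheerio';

// Made-up markup: member names sit in the 1st and 3rd columns of each row
const html = `
  <table>
    <tr><td>Albania</td><td>…</td><td>Libya</td><td>…</td></tr>
    <tr><td>Algeria</td><td>…</td><td>Lithuania</td><td>…</td></tr>
  </table>
`;
const $ = cheerio.load(html);

// :nth-child(odd) matches the 1st, 3rd, 5th… child of each row
for (const element of $('tr td:nth-child(odd)').toArray()) {
  console.log($(element).text()); // Albania, Libya, Algeria, Lithuania
}
```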
### Scrape F1 news diff --git a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md index 8670a0536e..7e1a15dfb8 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/getting-links import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCountryLinksExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_country_links.mjs'; +import WtaTennisLinksExercise from '!!raw-loader!roa-loader!./exercises/wta_tennis_links.mjs'; import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.mjs'; @@ -324,27 +324,27 @@ Ta-da! We've managed to get links leading to the product pages. In the next less -### Scrape links to countries in Africa +### Scrape links to top tennis players -Download Wikipedia's page with the list of African countries, use Cheerio to parse it, and print links to Wikipedia pages of all the states and territories mentioned in all tables. Start with this URL: +Download the WTA singles rankings page, use Cheerio to parse it, and print links to the detail pages of the listed players. Start with this URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.wtatennis.com/rankings/singles ``` Your program should print the following: ```text -https://en.wikipedia.org/wiki/Algeria -https://en.wikipedia.org/wiki/Angola -https://en.wikipedia.org/wiki/Benin -https://en.wikipedia.org/wiki/Botswana +https://www.wtatennis.com/players/318310/iga-swiatek +https://www.wtatennis.com/players/322341/aryna-sabalenka +https://www.wtatennis.com/players/326911/coco-gauff +https://www.wtatennis.com/players/320203/elena-rybakina ... ```
Solution - {WikipediaCountryLinksExercise.code} + {WtaTennisLinksExercise.code}
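Links in listings like this are often relative, which is why the solution resolves each `href` against the listing URL. The built-in `URL` constructor does the resolving — the path below is a hypothetical example:

```js
const listingUrl = 'https://www.wtatennis.com/rankings/singles';

// A hypothetical relative href, as it might appear in the markup
const href = '/players/123456/jane-doe';

const absoluteUrl = new URL(href, listingUrl).href;
console.log(absoluteUrl); // https://www.wtatennis.com/players/123456/jane-doe
```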
### Scrape links to F1 news diff --git a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md index 7fb737c293..fc55568cf2 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/crawling import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.mjs'; +import WtaTennisPlayersExercise from '!!raw-loader!roa-loader!./exercises/wta_tennis_players.mjs'; import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.mjs'; @@ -210,36 +210,27 @@ In the next lesson, we'll scrape the product detail pages so that each product v -### Scrape calling codes of African countries +### Scrape birthplaces of top 5 tennis players -Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL: +Scrape links to detail pages of the top 5 tennis players according to WTA rankings. Follow the links and extract the birthplace of each player. Print the URL of the player's detail page, then `|` as a separator, then the birthplace. Start with this URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.wtatennis.com/rankings/singles ``` Your program should print the following: ```text -https://en.wikipedia.org/wiki/Algeria +213 -https://en.wikipedia.org/wiki/Angola +244 -https://en.wikipedia.org/wiki/Benin +229 -https://en.wikipedia.org/wiki/Botswana +267 -https://en.wikipedia.org/wiki/Burkina_Faso +226 -https://en.wikipedia.org/wiki/Burundi null -https://en.wikipedia.org/wiki/Cameroon +237 -... +https://www.wtatennis.com/players/320760/aryna-sabalenka | Minsk, Belarus +https://www.wtatennis.com/players/326408/iga-swiatek | Warsaw, Poland +https://www.wtatennis.com/players/328560/coco-gauff | Delray Beach, Fl. USA +https://www.wtatennis.com/players/326384/amanda-anisimova | Miami Beach, FL, USA +https://www.wtatennis.com/players/324166/elena-rybakina | Moscow, Russia ``` -:::tip Need a nudge? - -Locating cells in tables is sometimes easier if you know how to [filter](https://cheerio.js.org/docs/api/classes/Cheerio#filter) or [navigate up](https://cheerio.js.org/docs/api/classes/Cheerio#parent) in the HTML element tree. - -::: -
Solution - {WikipediaCallingCodesExercise.code} + {WtaTennisPlayersExercise.code}
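When following links to detail pages, it's good practice not to hammer the server with rapid-fire requests. A minimal sketch of downloading pages one by one with a pause in between — the one-second delay is an arbitrary choice:

```js
// Sketch: fetch detail pages sequentially with a pause between requests
function sleep(ms) {
  return new Promise((resolve) => { setTimeout(resolve, ms); });
}

async function downloadAll(urls) {
  const htmls = [];
  for (const url of urls) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    htmls.push(await response.text());
    await sleep(1000); // wait a second before the next request
  }
  return htmls;
}
```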
### Scrape authors of F1 news articles diff --git a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md index 3a85eec446..5c256f17ae 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/scraping-variants import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import NpmLlmPackagesExercise from '!!raw-loader!roa-loader!./exercises/npm_llm_packages.mjs'; +import JsLlmProjectsExercise from '!!raw-loader!roa-loader!./exercises/js_llm_projects.mjs'; import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.mjs'; @@ -347,38 +347,38 @@ Is this the end? Maybe! In the next lesson, we'll use a scraping framework to bu -### Build a scraper for watching npm packages +### Build a scraper for watching JavaScript projects -You can build a scraper now, can't you? Let's build another one! From the registry at [npmjs.com](https://www.npmjs.com/), scrape information about npm packages that match the following criteria: +You can build a scraper now, can't you? Let's build another one! From the [GitHub Topics](https://github.com/topics/) page, scrape information about projects that match the following criteria: -- Have the keyword "LLM" (as in _large language model_) -- Updated within the last two years ("2 years ago" is okay; "3 years ago" is too old) +- Have the topic "LLM" (as in _large language model_) +- Updated within the last month (at most 30 days ago) -Print an array of the top 5 packages with the most dependents. Each package should be represented by an object containing the following data: +Print an array of the top 5 projects with the most stars. Each project should be represented by an object containing the following data: - Name - Description -- URL to the package detail page -- Number of dependents -- Number of downloads +- URL to the repository page +- Number of stars +- Date it was updated on Your output should look something like this: ```js [ { - name: 'langchain', - url: 'https://www.npmjs.com/package/langchain', - description: 'Typescript bindings for langchain', - dependents: 735, - downloads: 3938 + name: 'anything-llm', + url: 'https://github.com/Mintplex-Labs/anything-llm', + description: 'The all-in-one Desktop & Docker AI application with built-in RAG, AI agents, No-code agent builder, MCP compatibility, and more.', + stars: 53358, + updatedOn: "2026-01-15" }, { - name: '@langchain/core', - url: 'https://www.npmjs.com/package/@langchain/core', - description: 'Core LangChain.js abstractions and schemas', - dependents: 730, - downloads: 5994 + name: 'SillyTavern', + url: 'https://github.com/SillyTavern/SillyTavern', + description: 'LLM Frontend for Power Users.', + stars: 22054, + updatedOn: "2026-01-15" }, ... ] @@ -387,14 +387,11 @@ Your output should look something like this:
Solution

-   After inspecting the registry, you'll notice that packages with the keyword "LLM" have a dedicated URL. Also, changing the sorting dropdown results in a page with its own URL. We'll use that as our starting point, which saves us from having to scrape the whole registry and then filter by keyword or sort by the number of dependents.
+   After inspecting the page, you'll notice that projects with the topic "LLM" have a dedicated URL. Also, changing the language and sorting dropdowns results in a page with its own URL. We'll use that as our starting point, which saves us from having to scrape the whole GitHub Topics listing and then filter by topic or sort by the number of stars.

-   {NpmLlmPackagesExercise.code}
-
-   Since the HTML doesn't contain any descriptive classes, we must rely on its structure. We're using [`.children()`](https://cheerio.js.org/docs/api/classes/Cheerio#children) to carefully navigate the HTML element tree.
-
-   For items older than 2 years, we return `null` instead of an item. Before printing the results, we use [.filter()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/filter) to remove these empty values and [.splice()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/splice) the array down to just 5 items.
+   Both the exact number of stars and the `updatedOn` date can be read from hidden attributes of some of the HTML elements, which saves us from making any additional requests.
+   {JsLlmProjectsExercise.code}

</details>
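Taking the top of a stars-sorted listing usually makes the freshness criterion a non-issue, but if you want to enforce the 30-day limit explicitly, you can compare the `datetime` attribute of the `relative-time` element against the current date. A sketch of that check, assuming `updatedAt` holds a value like the one below:

```js
// Sketch: skip projects updated more than 30 days ago
const updatedAt = '2026-01-15T09:30:00Z'; // value of the relative-time element's datetime attribute
const ageInDays = (Date.now() - new Date(updatedAt).getTime()) / (1000 * 60 * 60 * 24);

if (ageInDays > 30) {
  // In the scraper, return null here and rely on the final .filter() to drop the item
  console.log('too old, skipping');
}
```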
### Find the shortest CNN article which made it to the Sports homepage diff --git a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md index b2e86624b1..8c9a687473 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md @@ -429,7 +429,7 @@ If you export the dataset as JSON, it should look something like this: ### Use Crawlee to find the ratings of the most popular Netflix films -The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data: +The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data: - URL of the film's IMDb page - Title diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs index 19da811bc3..6707280ee8 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs @@ -15,9 +15,10 @@ const crawler = new CheerioCrawler({ }); } } else if (request.label === 'IMDB_SEARCH') { - await enqueueLinks({ selector: '.find-result-item a', label: 'IMDB', limit: 1 }); + await enqueueLinks({ selector: '.ipc-title-link-wrapper', label: 'IMDB', limit: 1 }); } else { - const requests = $("[data-uia='top10-table-row-title'] button").toArray().map((buttonElement) => { + const buttons = $("[data-uia='top10-table-row-title'] button").toArray().slice(0, 5); + const requests = buttons.map((buttonElement) => { const name = $(buttonElement).text().trim(); const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`; return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' }); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs new file mode 100644 index 0000000000..22c611ab17 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries.mjs @@ -0,0 +1,32 @@ +import * as cheerio from 'cheerio'; + +const url = 'https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const tableElement of $('.content table').toArray()) { + const $table = $(tableElement); + const rows = $table.find('tr').toArray(); + + for (const rowElement of rows) { + const $cells = $(rowElement).find('td'); + + const $firstCell = $cells.eq(0); + const firstCellText = $firstCell.text().trim(); + if (firstCellText) { + console.log(firstCellText); + } + 
+ const $thirdCell = $cells.eq(2); + const thirdCellText = $thirdCell.text().trim(); + if (thirdCellText) { + console.log(thirdCellText); + } + } +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries_single_selector.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries_single_selector.mjs new file mode 100644 index 0000000000..f682d5b40f --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/imo_countries_single_selector.mjs @@ -0,0 +1,18 @@ +import * as cheerio from 'cheerio'; + +const url = 'https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('.content table tr td:nth-child(odd)').toArray()) { + const name = $(element).text().trim(); + if (name) { + console.log(name); + } +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs new file mode 100644 index 0000000000..f74e5b533d --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/js_llm_projects.mjs @@ -0,0 +1,33 @@ +import * as cheerio from 'cheerio'; + +async function download(url) { + const response = await fetch(url); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const html = await response.text(); + return cheerio.load(html); +} + +const listingUrl = 'https://github.com/topics/llm?l=javascript&s=stars'; +const $ = await download(listingUrl); + +const promises = $('article').toArray().map(async (element) => { + const $card = $(element); + const $link = $card.find('h3 a:nth-child(1)').first(); + + const url = new URL($link.attr('href'), listingUrl).href; + const name = $link.text().trim(); + const description = $card.find('p').text().trim(); + + const starsText = $card.find('#repo-stars-counter-star').first().attr('aria-label'); + const stars = parseInt(starsText.split(' ')[0], 10); + + const updatedAt = $card.find('relative-time').attr('datetime'); + const updatedOn = updatedAt.split('T')[0]; + + return { name, url, description, stars, updatedOn }; +}); + +const data = (await Promise.all(promises)).filter((item) => item); +console.log(data.slice(0, 5)); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs deleted file mode 100644 index f52a885057..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs +++ /dev/null @@ -1,61 +0,0 @@ -import * as cheerio from 'cheerio'; - -async function download(url) { - const response = await fetch(url); - if (!response.ok) { - throw new Error(`HTTP ${response.status}`); - } - const html = await response.text(); - return cheerio.load(html); -} - -function parseNumber(text) { - return Number.parseInt(text.replace(/[^0-9]/g, ''), 10); -} - -const listingUrl = 'https://www.npmjs.com/search?page=0&q=keywords%3Allm&sortBy=dependent_count'; -const $ = await download(listingUrl); - -const promises = $('section').toArray().map(async (element) => { - const $card = $(element); - const $link = $card.find('a').first(); - if (!$link.length) { - return null; - } - - const details = $card - .children() - .first() - .children() - .last() - 
.text() - .split('•') - .map((item) => item.trim()); - - const updatedText = details[2] ?? ''; - const dependentsText = details[3] ?? ''; - const dependents = parseNumber(dependentsText); - - if (updatedText.includes('years ago')) { - const yearsAgo = parseNumber(updatedText); - if (Number.isFinite(yearsAgo) && yearsAgo > 2) { - return null; - } - } - - const name = $link.text().trim(); - const url = new URL($link.attr('href'), listingUrl).href; - const description = $card.find('p').text().trim(); - - const downloadsText = $card - .children() - .last() - .text() - .trim(); - const downloads = parseNumber(downloadsText); - - return { name, url, description, dependents, downloads }; -}); - -const data = (await Promise.all(promises)).filter((item) => item); -console.log(data.slice(0, 5)); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index 618b64cd14..0deec70fec 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -31,17 +31,19 @@ teardown_file() { [[ "$output" == "18" ]] } -@test "lists African countries" { - run node wikipedia_countries.mjs +@test "lists IMO countries" { + run node imo_countries.mjs - [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ "$output" == *$'Albania\nLibya\n'* ]] + [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "lists African countries with a single selector" { - run node wikipedia_countries_single_selector.mjs +@test "lists IMO countries with a single selector" { + run node imo_countries_single_selector.mjs - [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ "$output" == *$'Albania\nLibya\n'* ]] + [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -56,7 +58,7 @@ teardown_file() { run node warehouse_units.mjs [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] - [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 76\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -64,7 +66,7 @@ teardown_file() { run node warehouse_units_regex.mjs [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] - [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 76\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -72,7 +74,7 @@ teardown_file() { run node guardian_publish_dates.mjs [[ "$output" == *' F1 '* ]] - [[ "$output" == *' | Sun '* ]] # has info about date, Sundays are very likely + [[ "$output" == *' | Mon '* ]] # has info about date, Mondays are very likely [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -84,12 +86,11 @@ teardown_file() { [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]] } -@test "lists Wikipedia country links" { - run node wikipedia_country_links.mjs +@test "lists WTA player links" { + run node wta_tennis_links.mjs - [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]] - [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *'https://www.wtatennis.com/players/'* ]] + [[ $(echo "$output" | wc -l) -gt 10 ]] } @test "lists Guardian F1 article 
links" { @@ -99,12 +100,13 @@ teardown_file() { [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "prints Wikipedia calling codes" { - run node wikipedia_calling_codes.mjs +@test "lists WTA player birthplaces" { + run node wta_tennis_players.mjs - [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]] - [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *'https://www.wtatennis.com/players/'* ]] + [[ "$output" == *' | '* ]] + [[ "$output" == *', '* ]] + [[ $(echo "$output" | wc -l) -eq 5 ]] } @test "lists Guardian F1 authors" { @@ -116,11 +118,17 @@ teardown_file() { [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "lists npm LLM packages" { - run node npm_llm_packages.mjs +@test "lists JavaScript GitHub repos with the LLM topic" { + run node js_llm_projects.mjs (( status == 0 )) - [[ -n "$output" ]] + [[ $(echo "$output" | wc -l) -eq 37 ]] + [[ "$output" == *' name: '* ]] + [[ "$output" == *' url: '* ]] + [[ "$output" == *'https://github.com/'* ]] + [[ "$output" == *' description: '* ]] + [[ "$output" == *' stars: '* ]] + [[ "$output" == *' updatedOn: '* ]] } @test "finds the shortest CNN sports article" { @@ -134,7 +142,7 @@ teardown_file() { (( status == 0 )) [[ -f dataset.json ]] - [[ $(cat dataset.json | jq '. | length') == "18" ]] + [[ $(cat dataset.json | jq '. | length') -gt 6 ]] [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } @@ -144,7 +152,7 @@ teardown_file() { (( status == 0 )) [[ -f dataset.json ]] - [[ $(cat dataset.json | jq '. | length') == "10" ]] + [[ $(cat dataset.json | jq '. 
| length') == "5" ]] [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] } diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs deleted file mode 100644 index fd9a7f2fb9..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs +++ /dev/null @@ -1,29 +0,0 @@ -import * as cheerio from 'cheerio'; - -const url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa'; -const response = await fetch(url); - -if (!response.ok) { - throw new Error(`HTTP ${response.status}`); -} - -const html = await response.text(); -const $ = cheerio.load(html); - -for (const tableElement of $('.wikitable').toArray()) { - const $table = $(tableElement); - const rows = $table.find('tr'); - - for (const rowElement of rows.toArray()) { - const $row = $(rowElement); - const cells = $row.find('td'); - - if (cells.length > 0) { - const $thirdColumn = $(cells[2]); - const $link = $thirdColumn.find('a').first(); - if ($link.length) { - console.log($link.text()); - } - } - } -} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs deleted file mode 100644 index 06f54d0686..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs +++ /dev/null @@ -1,19 +0,0 @@ -import * as cheerio from 'cheerio'; - -const url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa'; -const response = await fetch(url); - -if (!response.ok) { - throw new Error(`HTTP ${response.status}`); -} - -const html = await response.text(); -const $ = cheerio.load(html); - -for (const element of $('.wikitable tr td:nth-child(3)').toArray()) { - const $nameCell = $(element); - const $link = $nameCell.find('a').first(); - if ($link.length) { - console.log($link.text()); - } -} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs deleted file mode 100644 index 53c95f8d4d..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs +++ /dev/null @@ -1,20 +0,0 @@ -import * as cheerio from 'cheerio'; - -const listingUrl = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa'; -const response = await fetch(listingUrl); - -if (!response.ok) { - throw new Error(`HTTP ${response.status}`); -} - -const html = await response.text(); -const $ = cheerio.load(html); - -for (const element of $('.wikitable tr td:nth-child(3)').toArray()) { - const $nameCell = $(element); - const $link = $nameCell.find('a').first(); - if ($link.length) { - const url = new URL($link.attr('href'), listingUrl).href; - console.log(url); - } -} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_links.mjs new file mode 100644 index 0000000000..ac075f4cb0 --- /dev/null +++ 
b/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_links.mjs @@ -0,0 +1,17 @@ +import * as cheerio from 'cheerio'; + +const listingUrl = 'https://www.wtatennis.com/rankings/singles'; +const response = await fetch(listingUrl); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('.rankings__list .player-row-drawer__link').toArray()) { + const playerUrlRelative = $(element).attr('href'); + const playerUrl = new URL(playerUrlRelative, listingUrl).href; + console.log(playerUrl); +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_players.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_players.mjs new file mode 100644 index 0000000000..3ed455e2bc --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wta_tennis_players.mjs @@ -0,0 +1,30 @@ +import * as cheerio from 'cheerio'; + +async function download(url) { + const response = await fetch(url); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const html = await response.text(); + return cheerio.load(html); +} + +const listingUrl = 'https://www.wtatennis.com/rankings/singles'; +const $listing = await download(listingUrl); +const playerLinks = $listing('.rankings__list .player-row-drawer__link').toArray(); + +for (const element of playerLinks.slice(0, 5)) { + const playerUrlRelative = $listing(element).attr('href'); + const playerUrl = new URL(playerUrlRelative, listingUrl).href; + const $player = await download(playerUrl); + + for (const infoBlock of $player('.profile-bio__info-block').toArray()) { + const $infoBlock = $player(infoBlock); + const label = $infoBlock.find('h2').text().trim().toLowerCase(); + if (label === 'birthplace') { + const birthplace = $infoBlock.find('span').text().trim(); + console.log(`${playerUrl} | ${birthplace}`); + break; + } + } +} diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md index 5dce9bc0e6..74d1e1aefb 100644 --- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md @@ -7,8 +7,8 @@ slug: /scraping-basics-python/locating-elements import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCountriesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries.py'; -import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.py'; +import ImoCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.py'; +import ImoCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/imo_countries_single_selector.py'; import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.py'; **In this lesson we'll locate product data in the downloaded HTML. We'll use BeautifulSoup to find those HTML elements which contain details about each product, such as title or price.** @@ -217,45 +217,40 @@ Great! 
We have managed to use CSS selectors and walk the HTML tree to get a list -### Scrape Wikipedia +### Scrape list of International Maritime Organization members -Download Wikipedia's page with the list of African countries, use Beautiful Soup to parse it, and print short English names of all the states and territories mentioned in all tables. This is the URL: +Download International Maritime Organization's page with the list of members, use Beautiful Soup to parse it, and print names of all the members mentioned in all tables (including Associate Members). This is the URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx ``` Your program should print the following: ```text +Albania +Libya Algeria -Angola -Benin -Botswana -Burkina Faso -Burundi -Cameroon -Cape Verde -Central African Republic -Chad -Comoros -Democratic Republic of the Congo -Republic of the Congo -Djibouti +Lithuania ... +Liberia +Zimbabwe +Faroes +Hong Kong, China +Macao, China ```
Solution

-   {WikipediaCountriesExercise.code}
+   {ImoCountriesExercise.code}

-   Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells.
+   We visit each row, and if it contains [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first and the third cell and print each value that isn't empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows.

</details>
### Use CSS selectors to their max -Simplify the code from previous exercise. Use a single for loop and a single CSS selector. +Simplify your International Maritime Organization scraper from the previous exercise. Use just one `for` loop with a single CSS selector that targets all relevant table cells. :::tip Need a nudge? @@ -268,7 +263,7 @@ You may want to check out the following pages:
Solution - {WikipediaCountriesSingleSelectorExercise.code} + {ImoCountriesSingleSelectorExercise.code}
### Scrape F1 news diff --git a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md index ea5a79a915..5e767dfcd7 100644 --- a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md @@ -7,7 +7,7 @@ slug: /scraping-basics-python/getting-links import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCountryLinksExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_country_links.py'; +import WtaTennisLinksExercise from '!!raw-loader!roa-loader!./exercises/wta_tennis_links.py'; import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.py'; **In this lesson, we'll locate and extract links to individual product pages. We'll use BeautifulSoup to find the relevant bits of HTML.** @@ -327,27 +327,27 @@ Ta-da! We've managed to get links leading to the product pages. In the next less -### Scrape links to countries in Africa +### Scrape links to top tennis players -Download Wikipedia's page with the list of African countries, use Beautiful Soup to parse it, and print links to Wikipedia pages of all the states and territories mentioned in all tables. Start with this URL: +Download the WTA singles rankings page, use Beautiful Soup to parse it, and print links to the detail pages of the listed players. Start with this URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.wtatennis.com/rankings/singles ``` Your program should print the following: ```text -https://en.wikipedia.org/wiki/Algeria -https://en.wikipedia.org/wiki/Angola -https://en.wikipedia.org/wiki/Benin -https://en.wikipedia.org/wiki/Botswana +https://www.wtatennis.com/players/318310/iga-swiatek +https://www.wtatennis.com/players/322341/aryna-sabalenka +https://www.wtatennis.com/players/326911/coco-gauff +https://www.wtatennis.com/players/320203/elena-rybakina ... ```
- Solution - {WikipediaCountryLinksExercise.code} + Solution + {WtaTennisLinksExercise.code}
### Scrape links to F1 news diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md b/sources/academy/webscraping/scraping_basics_python/10_crawling.md index 893683792b..f2af6190b9 100644 --- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md @@ -7,7 +7,7 @@ slug: /scraping-basics-python/crawling import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.py'; +import WtaTennisPlayersExercise from '!!raw-loader!roa-loader!./exercises/wta_tennis_players.py'; import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.py'; **In this lesson, we'll follow links to individual product pages. We'll use HTTPX to download them and BeautifulSoup to process them.** @@ -183,36 +183,27 @@ In the next lesson, we'll scrape the product detail pages so that each product v -### Scrape calling codes of African countries +### Scrape birthplaces of top 5 tennis players -Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL: +Scrape links to detail pages of the top 5 tennis players according to WTA rankings. Follow the links and extract the birthplace of each player. Print the URL of the player's detail page, then `|` as a separator, then the birthplace. Start with this URL: ```text -https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa +https://www.wtatennis.com/rankings/singles ``` Your program should print the following: ```text -https://en.wikipedia.org/wiki/Algeria +213 -https://en.wikipedia.org/wiki/Angola +244 -https://en.wikipedia.org/wiki/Benin +229 -https://en.wikipedia.org/wiki/Botswana +267 -https://en.wikipedia.org/wiki/Burkina_Faso +226 -https://en.wikipedia.org/wiki/Burundi None -https://en.wikipedia.org/wiki/Cameroon +237 -... +https://www.wtatennis.com/players/320760/aryna-sabalenka | Minsk, Belarus +https://www.wtatennis.com/players/326408/iga-swiatek | Warsaw, Poland +https://www.wtatennis.com/players/328560/coco-gauff | Delray Beach, Fl. USA +https://www.wtatennis.com/players/326384/amanda-anisimova | Miami Beach, FL, USA +https://www.wtatennis.com/players/324166/elena-rybakina | Moscow, Russia ``` -:::tip Need a nudge? - -Locating cells in tables is sometimes easier if you know how to [navigate up](https://beautiful-soup-4.readthedocs.io/en/latest/index.html#going-up) in the HTML element soup. - -::: -
Solution - {WikipediaCallingCodesExercise.code} + {WtaTennisPlayersExercise.code}
### Scrape authors of F1 news articles diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md index afa8736655..01f1095c17 100644 --- a/sources/academy/webscraping/scraping_basics_python/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md @@ -470,7 +470,7 @@ If you export the dataset as JSON, it should look something like this: ### Use Crawlee to find the ratings of the most popular Netflix films -The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data: +The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data: - URL of the film's IMDb page - Title diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py index b7f2000b37..31449cd7fb 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py @@ -11,7 +11,8 @@ async def main() -> None: @crawler.router.default_handler async def handle_netflix_table(context: BeautifulSoupCrawlingContext) -> None: requests: list[Request] = [] - for name_cell in context.soup.select('[data-uia="top10-table-row-title"] button'): + name_cells = context.soup.select('[data-uia="top10-table-row-title"] button') + for name_cell in name_cells[:5]: name = name_cell.text.strip() imdb_search_url = ( f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft" @@ -21,7 +22,9 @@ async def handle_netflix_table(context: BeautifulSoupCrawlingContext) -> None: @crawler.router.handler("IMDB_SEARCH") async def handle_imdb_search(context: BeautifulSoupCrawlingContext) -> None: - await context.enqueue_links(selector=".find-result-item a", label="IMDB", limit=1) + await context.enqueue_links( + selector=".ipc-title-link-wrapper", label="IMDB", limit=1 + ) @crawler.router.handler("IMDB") async def handle_imdb(context: BeautifulSoupCrawlingContext) -> None: diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py b/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py new file mode 100644 index 0000000000..438ea51b3b --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries.py @@ -0,0 +1,20 @@ +import httpx +from bs4 import BeautifulSoup + +url = "https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx" + +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for table in soup.select(".content table"): + for row in table.select("tr"): + if cells := row.select("td"): + first_column = cells[0] + if text := first_column.text.strip(): + print(text) + if len(cells) > 2: + third_column = cells[2] + 
if text := third_column.text.strip(): + print(text) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries_single_selector.py b/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries_single_selector.py new file mode 100644 index 0000000000..168e172c2d --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/imo_countries_single_selector.py @@ -0,0 +1,12 @@ +import httpx +from bs4 import BeautifulSoup + +url = "https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx" +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for cell in soup.select(".content table tr td:nth-child(odd)"): + if name := cell.text.strip(): + print(name) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py b/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py index 934d347b8a..9854aa7b71 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py @@ -5,5 +5,5 @@ products = json.load(file) for product in products: - if int(product['min_price']) > 500: + if int(product["min_price"]) > 50000: pp(product) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 2de3db35f3..20ea6914a5 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -7,65 +7,67 @@ teardown() { } @test "outputs the HTML with Star Wars products" { - run uv run --with=httpx python lego.py + run uv run -q --with=httpx python lego.py [[ "$output" == *"Millennium Falcon"* ]] } @test "counts the number of F1 Academy teams" { - run uv run --with=httpx --with=beautifulsoup4 python f1academy_teams.py + run uv run -q --with=httpx --with=beautifulsoup4 python f1academy_teams.py [[ "$output" == "6" ]] } @test "counts the number of F1 Academy drivers" { - run uv run --with=httpx --with=beautifulsoup4 python f1academy_drivers.py + run uv run -q --with=httpx --with=beautifulsoup4 python f1academy_drivers.py [[ "$output" == "18" ]] } -@test "lists African countries" { - run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries.py +@test "lists IMO countries" { + run uv run -q --with=httpx --with=beautifulsoup4 python imo_countries.py - [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ "$output" == *$'Albania\nLibya\n'* ]] + [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "lists African countries with a single selector" { - run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries_single_selector.py +@test "lists IMO countries with a single selector" { + run uv run -q --with=httpx --with=beautifulsoup4 python imo_countries_single_selector.py - [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ "$output" == *$'Albania\nLibya\n'* ]] + [[ "$output" == *$'\nZimbabwe\nFaroes\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 article titles" { - run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_titles.py + run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_titles.py [[ "$output" == *' F1 '* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints 
warehouse stock counts" { - run uv run --with=httpx --with=beautifulsoup4 python warehouse_units.py + run uv run -q --with=httpx --with=beautifulsoup4 python warehouse_units.py [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] - [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 76\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints warehouse stock counts using regex" { - run uv run --with=httpx --with=beautifulsoup4 python warehouse_units_regex.py + run uv run -q --with=httpx --with=beautifulsoup4 python warehouse_units_regex.py [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] - [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 76\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints Guardian F1 titles with publish dates" { - run uv run --with=httpx --with=beautifulsoup4 python guardian_publish_dates.py + run uv run -q --with=httpx --with=beautifulsoup4 python guardian_publish_dates.py [[ "$output" == *' F1 '* ]] - [[ "$output" == *' | Sun '* ]] # has info about date, Sundays are very likely + [[ "$output" == *' | Mon '* ]] # has info about date, Mondays are very likely [[ $(echo "$output" | wc -l) -gt 5 ]] } @@ -77,31 +79,31 @@ teardown() { [[ "$output" == "{'title': 'Premium Speakers', 'min_price': 75000, 'price': 75000}" ]] } -@test "lists Wikipedia country links" { - run uv run --with=httpx --with=beautifulsoup4 python wikipedia_country_links.py +@test "lists WTA player links" { + run uv run -q --with=httpx --with=beautifulsoup4 python wta_tennis_links.py - [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]] - [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *'https://www.wtatennis.com/players/'* ]] + [[ $(echo "$output" | wc -l) -gt 10 ]] } @test "lists Guardian F1 article links" { - run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_links.py + run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_links.py [[ "$output" == *'https://www.theguardian.com/sport/'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } -@test "prints Wikipedia calling codes" { - run uv run --with=httpx --with=beautifulsoup4 python wikipedia_calling_codes.py +@test "lists WTA player birthplaces" { + run uv run -q --with=httpx --with=beautifulsoup4 python wta_tennis_players.py - [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]] - [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + [[ "$output" == *'https://www.wtatennis.com/players/'* ]] + [[ "$output" == *' | '* ]] + [[ "$output" == *', '* ]] + [[ $(echo "$output" | wc -l) -eq 5 ]] } @test "lists Guardian F1 authors" { - run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py + run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py [[ "$output" == *' F1 '* ]] [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him) @@ -110,7 +112,7 @@ teardown() { } @test "lists Python database jobs" { - run uv run --with=httpx --with=beautifulsoup4 python python_jobs_database.py + run uv run -q --with=httpx --with=beautifulsoup4 python python_jobs_database.py [[ "$output" == *"'title': '"* ]] [[ 
"$output" == *"'company': '"* ]] @@ -119,27 +121,27 @@ teardown() { } @test "finds the shortest CNN sports article" { - run uv run --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py + run uv run -q --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py [[ "$output" == 'https://edition.cnn.com/'* ]] } @test "scrapes F1 Academy driver details with Crawlee" { - run uv run --with=crawlee[beautifulsoup] python crawlee_f1_drivers.py + run uv run -q --with=crawlee[beautifulsoup] python crawlee_f1_drivers.py (( status == 0 )) [[ -f dataset.json ]] - [[ $(cat dataset.json | jq '. | length') == "18" ]] + [[ $(cat dataset.json | jq '. | length') -gt 6 ]] [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } @test "scrapes Netflix ratings with Crawlee" { - run uv run --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py + run uv run -q --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py (( status == 0 )) [[ -f dataset.json ]] - [[ $(cat dataset.json | jq '. | length') == "10" ]] + [[ $(cat dataset.json | jq '. | length') -gt 5 ]] # should be -eq 5, but there is a bug https://github.com/apify/crawlee-python/issues/1673 [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py deleted file mode 100644 index 4d424d6dc1..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py +++ /dev/null @@ -1,32 +0,0 @@ -import httpx -from bs4 import BeautifulSoup -from urllib.parse import urljoin - - -def download(url: str) -> BeautifulSoup: - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - -def parse_calling_code(soup: BeautifulSoup) -> str | None: - for label in soup.select('th.infobox-label'): - if label.text.strip() == 'Calling code': - cell = label.parent.select_one('td.infobox-data') - return cell.text.strip() if cell else None - return None - - -listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" -listing_soup = download(listing_url) - -for name_cell in listing_soup.select('.wikitable tr td:nth-child(3)'): - link = name_cell.select_one('a') - if not link or 'href' not in link.attrs: - continue - - country_url = urljoin(listing_url, link['href']) - country_soup = download(country_url) - calling_code = parse_calling_code(country_soup) - - print(country_url, calling_code) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py deleted file mode 100644 index 0d4769ccbb..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py +++ /dev/null @@ -1,17 +0,0 @@ -import httpx -from bs4 import BeautifulSoup - -url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" -response = httpx.get(url) -response.raise_for_status() - -soup = BeautifulSoup(response.text, "html.parser") - -for table in soup.select(".wikitable"): - for row in table.select("tr"): - 
cells = row.select("td") - if cells: - third_column = cells[2] - link = third_column.select_one("a") - if link: - print(link.text) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py deleted file mode 100644 index 1fc4a6b268..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py +++ /dev/null @@ -1,13 +0,0 @@ -import httpx -from bs4 import BeautifulSoup - -url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" -response = httpx.get(url) -response.raise_for_status() - -soup = BeautifulSoup(response.text, "html.parser") - -for name_cell in soup.select(".wikitable tr td:nth-child(3)"): - link = name_cell.select_one("a") - if link: - print(link.text) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py deleted file mode 100644 index f435016e45..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py +++ /dev/null @@ -1,15 +0,0 @@ -import httpx -from bs4 import BeautifulSoup -from urllib.parse import urljoin - -listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" -response = httpx.get(listing_url) -response.raise_for_status() - -soup = BeautifulSoup(response.text, "html.parser") - -for name_cell in soup.select('.wikitable tr td:nth-child(3)'): - link = name_cell.select_one('a') - if link and 'href' in link.attrs: - url = urljoin(listing_url, link['href']) - print(url) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_links.py b/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_links.py new file mode 100644 index 0000000000..e68c809f76 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_links.py @@ -0,0 +1,13 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + +listing_url = "https://www.wtatennis.com/rankings/singles" +response = httpx.get(listing_url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for link in soup.select(".rankings__list .player-row-drawer__link"): + player_url = urljoin(listing_url, link["href"]) + print(player_url) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_players.py b/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_players.py new file mode 100644 index 0000000000..c98b2a5fa5 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/wta_tennis_players.py @@ -0,0 +1,24 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +def download(url: str) -> BeautifulSoup: + response = httpx.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, "html.parser") + + +listing_url = "https://www.wtatennis.com/rankings/singles" +listing_soup = download(listing_url) +player_links = listing_soup.select(".rankings__list .player-row-drawer__link") + +for link in player_links[:5]: + player_url = urljoin(listing_url, link["href"]) + player_soup = download(player_url) + + for info_block in player_soup.select(".profile-bio__info-block"): + label_text = 
info_block.select_one("h2").text.strip() + if label_text.lower() == "birthplace": + birthplace = info_block.select_one("span").text.strip() + print(player_url, "|", birthplace)