19 commits
5e531ee
feat: use IMO website instead of Wikipedia
honzajavorek Jan 14, 2026
804fa84
feat: improve exercise guidance, shorten the JS code example
honzajavorek Jan 14, 2026
f5aee7b
fix: price should be in cents
honzajavorek Jan 14, 2026
08b9e3f
feat: migrate follow-up exercise from Wikipedia to IMO
honzajavorek Jan 14, 2026
5469966
chore: remove redundant exercise code
honzajavorek Jan 14, 2026
b1d441c
feat: move links exercise from Wikipedia to UNESCO
honzajavorek Jan 14, 2026
3296ec9
feat: move links exercise from Wikipedia to UNESCO, Python course
honzajavorek Jan 14, 2026
3bafd42
feat: move away from Wikipedia to UNESCO for a crawling exercise
honzajavorek Jan 14, 2026
0e71496
feat: move away from Wikipedia to UNESCO for a crawling exercise, Jav…
honzajavorek Jan 14, 2026
79cceba
fix: edit exercise text to decribe the new, simplified approach
honzajavorek Jan 15, 2026
33a7195
fix: use PascalCase so that we do UnescoWhsCount, not UNESCOWHSCount
honzajavorek Jan 15, 2026
a4618ae
fix: bad extension in an import
honzajavorek Jan 15, 2026
c896843
style: make linter happy
honzajavorek Jan 15, 2026
1c1767d
fix: limit UNESCO scraping to 10 countries not to DoS them
honzajavorek Jan 15, 2026
b2dee1b
fix: replace UNESCO with WTA, because UNESCO is super unreliable
honzajavorek Jan 15, 2026
404760c
fix: modify Netflix/IMDb exercise so that the tests pass
honzajavorek Jan 15, 2026
c01c0a8
fix: don't scrape the npm registry, as it became highly protected
honzajavorek Jan 15, 2026
d1cc411
style: make linter happy
honzajavorek Jan 15, 2026
37a06ef
fix: remove leftover line
honzajavorek Jan 15, 2026
@@ -8,8 +8,8 @@ slug: /scraping-basics-javascript/locating-elements
import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
import WikipediaCountriesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries.mjs';
import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.mjs';
import ImoCountriesExercise from '!!raw-loader!roa-loader!./exercises/imo_countries.mjs';
import ImoCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/imo_countries_single_selector.mjs';
import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.mjs';

<LegacyJsCourseAdmonition />
@@ -212,45 +212,46 @@ Great! We have managed to use CSS selectors and walk the HTML tree to get a list

<Exercises />

### Scrape Wikipedia
### Scrape list of International Maritime Organization members

Download Wikipedia's page with the list of African countries, use Cheerio to parse it, and print short English names of all the states and territories mentioned in all tables. This is the URL:
Download the International Maritime Organization's page with the list of members, use Cheerio to parse it, and print the names of all the members mentioned in all tables (including Associate Members). This is the URL:

```text
https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx
```

Your program should print the following:

```text
Albania
Libya
Algeria
Angola
Benin
Botswana
Burkina Faso
Burundi
Cameroon
Cape Verde
Central African Republic
Chad
Comoros
Democratic Republic of the Congo
Republic of the Congo
Djibouti
Lithuania
...
Liberia
Zimbabwe
Faroes
Hong Kong, China
Macao, China
```

:::tip Need a nudge?

You may want to check out Cheerio's [`.eq()`](https://cheerio.js.org/docs/api/classes/Cheerio#eq).

:::

<details>
<summary>Solution</summary>
<CodeBlock language="js">{WikipediaCountriesExercise.code}</CodeBlock>
<CodeBlock language="js">{ImoCountriesExercise.code}</CodeBlock>

Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells.
We visit each row, and if it contains some [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells, we take the text of the first and the third one and print each if it isn't empty. This approach skips [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th) and empty rows.

</details>

### Use CSS selectors to their max

Simplify the code from previous exercise. Use a single for loop and a single CSS selector.
Simplify your International Maritime Organization scraper from the previous exercise. Use just one `for` loop with a single CSS selector that targets all relevant table cells.

:::tip Need a nudge?

@@ -263,7 +264,7 @@ You may want to check out the following pages:

<details>
<summary>Solution</summary>
<CodeBlock language="js">{WikipediaCountriesSingleSelectorExercise.code}</CodeBlock>
<CodeBlock language="js">{ImoCountriesSingleSelectorExercise.code}</CodeBlock>
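The `td:nth-child(odd)` selector in the solution matches the 1st, 3rd, 5th… cell of each row. If it helps to see the same filter outside of CSS, here's a plain-array equivalent (the cell values below are made up):

```javascript
// `:nth-child(odd)` is 1-based: it matches the 1st, 3rd, 5th… child,
// which corresponds to zero-based indexes 0, 2, 4…
const cells = ['Albania', '(other column)', 'Libya', '(other column)']; // hypothetical row
const oddChildren = cells.filter((_, index) => index % 2 === 0);
console.log(oddChildren); // [ 'Albania', 'Libya' ]
```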
</details>

### Scrape F1 news
@@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/getting-links
import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
import WikipediaCountryLinksExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_country_links.mjs';
import WtaTennisLinksExercise from '!!raw-loader!roa-loader!./exercises/wta_tennis_links.mjs';
import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.mjs';

<LegacyJsCourseAdmonition />
@@ -324,27 +324,27 @@ Ta-da! We've managed to get links leading to the product pages. In the next less

<Exercises />

### Scrape links to countries in Africa
### Scrape links to top tennis players

Download Wikipedia's page with the list of African countries, use Cheerio to parse it, and print links to Wikipedia pages of all the states and territories mentioned in all tables. Start with this URL:
Download the WTA singles rankings page, use Cheerio to parse it, and print links to the detail pages of the listed players. Start with this URL:

```text
https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
https://www.wtatennis.com/rankings/singles
```

Your program should print the following:

```text
https://en.wikipedia.org/wiki/Algeria
https://en.wikipedia.org/wiki/Angola
https://en.wikipedia.org/wiki/Benin
https://en.wikipedia.org/wiki/Botswana
https://www.wtatennis.com/players/318310/iga-swiatek
https://www.wtatennis.com/players/322341/aryna-sabalenka
https://www.wtatennis.com/players/326911/coco-gauff
https://www.wtatennis.com/players/320203/elena-rybakina
...
```
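One thing to watch out for: the `href` attributes on the page are most likely relative, so you'll probably need to resolve them against the page URL before printing. For example:

```javascript
// A relative href as it might appear in the rankings table:
const listingUrl = 'https://www.wtatennis.com/rankings/singles';
const href = '/players/318310/iga-swiatek';

const absoluteUrl = new URL(href, listingUrl).href;
console.log(absoluteUrl); // https://www.wtatennis.com/players/318310/iga-swiatek
```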

<details>
<summary>Solution</summary>
<CodeBlock language="js">{WikipediaCountryLinksExercise.code}</CodeBlock>
<CodeBlock language="js">{WtaTennisLinksExercise.code}</CodeBlock>
</details>

### Scrape links to F1 news
@@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/crawling
import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.mjs';
import WtaTennisPlayersExercise from '!!raw-loader!roa-loader!./exercises/wta_tennis_players.mjs';
import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.mjs';

<LegacyJsCourseAdmonition />
@@ -210,36 +210,27 @@ In the next lesson, we'll scrape the product detail pages so that each product v

<Exercises />

### Scrape calling codes of African countries
### Scrape birthplaces of top 5 tennis players

Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
Scrape links to detail pages of the top 5 tennis players according to WTA rankings. Follow the links and extract the birthplace of each player. Print the URL of the player's detail page, then `|` as a separator, then the birthplace. Start with this URL:

```text
https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
https://www.wtatennis.com/rankings/singles
```

Your program should print the following:

```text
https://en.wikipedia.org/wiki/Algeria +213
https://en.wikipedia.org/wiki/Angola +244
https://en.wikipedia.org/wiki/Benin +229
https://en.wikipedia.org/wiki/Botswana +267
https://en.wikipedia.org/wiki/Burkina_Faso +226
https://en.wikipedia.org/wiki/Burundi null
https://en.wikipedia.org/wiki/Cameroon +237
...
https://www.wtatennis.com/players/320760/aryna-sabalenka | Minsk, Belarus
https://www.wtatennis.com/players/326408/iga-swiatek | Warsaw, Poland
https://www.wtatennis.com/players/328560/coco-gauff | Delray Beach, Fl. USA
https://www.wtatennis.com/players/326384/amanda-anisimova | Miami Beach, FL, USA
https://www.wtatennis.com/players/324166/elena-rybakina | Moscow, Russia
```
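Once the crawler has collected each player's URL and birthplace, producing the output line is a simple template literal. A sketch using two records from the expected output above:

```javascript
// Records as the crawler might collect them:
const players = [
  { url: 'https://www.wtatennis.com/players/320760/aryna-sabalenka', birthplace: 'Minsk, Belarus' },
  { url: 'https://www.wtatennis.com/players/326408/iga-swiatek', birthplace: 'Warsaw, Poland' },
];

// Join URL and birthplace with the ` | ` separator and print each line
const lines = players.map(({ url, birthplace }) => `${url} | ${birthplace}`);
for (const line of lines) console.log(line);
```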

:::tip Need a nudge?

Locating cells in tables is sometimes easier if you know how to [filter](https://cheerio.js.org/docs/api/classes/Cheerio#filter) or [navigate up](https://cheerio.js.org/docs/api/classes/Cheerio#parent) in the HTML element tree.

:::

<details>
<summary>Solution</summary>
<CodeBlock language="js">{WikipediaCallingCodesExercise.code}</CodeBlock>
<CodeBlock language="js">{WtaTennisPlayersExercise.code}</CodeBlock>
</details>

### Scrape authors of F1 news articles
@@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/scraping-variants
import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
import NpmLlmPackagesExercise from '!!raw-loader!roa-loader!./exercises/npm_llm_packages.mjs';
import JsLlmProjectsExercise from '!!raw-loader!roa-loader!./exercises/js_llm_projects.mjs';
import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.mjs';

<LegacyJsCourseAdmonition />
@@ -347,38 +347,38 @@ Is this the end? Maybe! In the next lesson, we'll use a scraping framework to bu

<Exercises />

### Build a scraper for watching npm packages
### Build a scraper for watching JavaScript projects

You can build a scraper now, can't you? Let's build another one! From the registry at [npmjs.com](https://www.npmjs.com/), scrape information about npm packages that match the following criteria:
You can build a scraper now, can't you? Let's build another one! From the [GitHub Topics](https://github.com/topics/) page, scrape information about projects that match the following criteria:

- Have the keyword "LLM" (as in _large language model_)
- Updated within the last two years ("2 years ago" is okay; "3 years ago" is too old)
- Have the topic "LLM" (as in _large language model_)
- Updated within the last month (at most 30 days ago)
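The "updated within the last month" criterion boils down to a date comparison. One way to sketch the check, assuming you've already extracted an ISO 8601 timestamp for each project:

```javascript
// Sketch of the "updated at most 30 days ago" check
const THIRTY_DAYS_MS = 30 * 24 * 60 * 60 * 1000;

function isRecent(isoTimestamp, now = Date.now()) {
  return now - Date.parse(isoTimestamp) <= THIRTY_DAYS_MS;
}

console.log(isRecent(new Date(Date.now() - 5 * 24 * 60 * 60 * 1000).toISOString())); // true
console.log(isRecent('2000-01-01T00:00:00Z')); // false
```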

Print an array of the top 5 packages with the most dependents. Each package should be represented by an object containing the following data:
Print an array of the top 5 projects with the most stars. Each project should be represented by an object containing the following data:

- Name
- Description
- URL to the package detail page
- Number of dependents
- Number of downloads
- URL to the repository page
- Number of stars
- Date it was updated on

Your output should look something like this:

```js
[
{
name: 'langchain',
url: 'https://www.npmjs.com/package/langchain',
description: 'Typescript bindings for langchain',
dependents: 735,
downloads: 3938
name: 'anything-llm',
url: 'https://github.com/Mintplex-Labs/anything-llm',
description: 'The all-in-one Desktop & Docker AI application with built-in RAG, AI agents, No-code agent builder, MCP compatibility, and more.',
stars: 53358,
updatedOn: "2026-01-15"
},
{
name: '@langchain/core',
url: 'https://www.npmjs.com/package/@langchain/core',
description: 'Core LangChain.js abstractions and schemas',
dependents: 730,
downloads: 5994
name: 'SillyTavern',
url: 'https://github.com/SillyTavern/SillyTavern',
description: 'LLM Frontend for Power Users.',
stars: 22054,
updatedOn: "2026-01-15"
},
...
]
```

@@ -387,14 +387,11 @@ Your output should look something like this:
<details>
<summary>Solution</summary>

After inspecting the registry, you'll notice that packages with the keyword "LLM" have a dedicated URL. Also, changing the sorting dropdown results in a page with its own URL. We'll use that as our starting point, which saves us from having to scrape the whole registry and then filter by keyword or sort by the number of dependents.
After inspecting the page, you'll notice that projects with the "LLM" topic have a dedicated URL. Also, changing the language and sorting dropdowns results in a page with its own URL. We'll use that as our starting point, which saves us from having to scrape all of GitHub Topics and then filter by topic or sort by the number of stars.

<CodeBlock language="js">{NpmLlmPackagesExercise.code}</CodeBlock>

Since the HTML doesn't contain any descriptive classes, we must rely on its structure. We're using [`.children()`](https://cheerio.js.org/docs/api/classes/Cheerio#children) to carefully navigate the HTML element tree.

For items older than 2 years, we return `null` instead of an item. Before printing the results, we use [.filter()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/filter) to remove these empty values and [.splice()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/splice) the array down to just 5 items.
Both the exact number of stars and the `updatedOn` date can be figured out from attributes of some of the HTML elements, which saves us from making any additional requests.
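In isolation, that attribute parsing looks like this (the attribute values below are hypothetical, shaped like what the Topics page serves):

```javascript
// Hypothetical aria-label of the star counter element:
const starsLabel = '53358 users starred this repository';
const stars = parseInt(starsLabel.split(' ')[0], 10);

// Hypothetical datetime attribute of a <relative-time> element:
const datetime = '2026-01-15T08:30:00Z';
const updatedOn = datetime.split('T')[0];

console.log(stars, updatedOn); // 53358 2026-01-15
```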

<CodeBlock language="js">{JsLlmProjectsExercise.code}</CodeBlock>
</details>

### Find the shortest CNN article which made it to the Sports homepage
@@ -429,7 +429,7 @@ If you export the dataset as JSON, it should look something like this:

### Use Crawlee to find the ratings of the most popular Netflix films

The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data:
The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data:

- URL of the film's IMDb page
- Title
@@ -15,9 +15,10 @@ const crawler = new CheerioCrawler({
});
}
} else if (request.label === 'IMDB_SEARCH') {
await enqueueLinks({ selector: '.find-result-item a', label: 'IMDB', limit: 1 });
await enqueueLinks({ selector: '.ipc-title-link-wrapper', label: 'IMDB', limit: 1 });
} else {
const requests = $("[data-uia='top10-table-row-title'] button").toArray().map((buttonElement) => {
const buttons = $("[data-uia='top10-table-row-title'] button").toArray().slice(0, 5);
const requests = buttons.map((buttonElement) => {
const name = $(buttonElement).text().trim();
const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`;
return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' });
@@ -0,0 +1,32 @@
import * as cheerio from 'cheerio';

const url = 'https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx';
const response = await fetch(url);

if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}

const html = await response.text();
const $ = cheerio.load(html);

for (const tableElement of $('.content table').toArray()) {
const $table = $(tableElement);
const rows = $table.find('tr').toArray();

for (const rowElement of rows) {
const $cells = $(rowElement).find('td');

const $firstCell = $cells.eq(0);
const firstCellText = $firstCell.text().trim();
if (firstCellText) {
console.log(firstCellText);
}

const $thirdCell = $cells.eq(2);
const thirdCellText = $thirdCell.text().trim();
if (thirdCellText) {
console.log(thirdCellText);
}
}
}
@@ -0,0 +1,18 @@
import * as cheerio from 'cheerio';

const url = 'https://www.imo.org/en/ourwork/ero/pages/memberstates.aspx';
const response = await fetch(url);

if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}

const html = await response.text();
const $ = cheerio.load(html);

for (const element of $('.content table tr td:nth-child(odd)').toArray()) {
const name = $(element).text().trim();
if (name) {
console.log(name);
}
}
@@ -0,0 +1,33 @@
import * as cheerio from 'cheerio';

async function download(url) {
const response = await fetch(url);
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
const html = await response.text();
return cheerio.load(html);
}

const listingUrl = 'https://github.com/topics/llm?l=javascript&s=stars';
const $ = await download(listingUrl);

const promises = $('article').toArray().map(async (element) => {
const $card = $(element);
const $link = $card.find('h3 a:nth-child(1)').first();

const url = new URL($link.attr('href'), listingUrl).href;
const name = $link.text().trim();
const description = $card.find('p').text().trim();

const starsText = $card.find('#repo-stars-counter-star').first().attr('aria-label');
const stars = parseInt(starsText.split(' ')[0], 10);

const updatedAt = $card.find('relative-time').attr('datetime');
const updatedOn = updatedAt.split('T')[0];

return { name, url, description, stars, updatedOn };
});

const data = (await Promise.all(promises)).filter((item) => item);
console.log(data.slice(0, 5));