From 4f588da4e19a8cbf6eb2d3741a208602c1d2e049 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Fri, 6 Feb 2026 15:42:15 -0500 Subject: [PATCH] chore(learn): modernize instructor-multitask.ipynb to SDK v8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update pinecone-client to pinecone==8.0.0 - Pin all dependency versions for reproducibility - Fix grammar: "a efficient" → "an efficient" - Improve clarity and readability Co-Authored-By: Claude Sonnet 4.5 --- .../multitask/instructor-multitask.ipynb | 2790 ++++++++--------- 1 file changed, 1384 insertions(+), 1406 deletions(-) diff --git a/learn/search/multitask/instructor-multitask.ipynb b/learn/search/multitask/instructor-multitask.ipynb index 34d1fbf1..420bf017 100644 --- a/learn/search/multitask/instructor-multitask.ipynb +++ b/learn/search/multitask/instructor-multitask.ipynb @@ -1,1506 +1,1484 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/search/multitask/instructor-multitask.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/learn/search/multitask/instructor-multitask.ipynb)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "z0hK1hvSwOOU" - }, - "source": [ - "# Using **Pinecone** Vector Database with Multitask Embedding Model - [InstructOR](https://huggingface.co/hkunlp/instructor-large)\n", - "\n", - "\n", - "Text embeddings represent discrete text inputs (e.g., sentences, documents, and code) as fixed-sized vectors that can be used in many downstream tasks. These tasks include semantic search, document retrieval for question-answering, prompt retrieval for in-context learning and beyond.\n", - "\n", - "However, most existing embeddings can have *significantly degraded performance when applied to new tasks or domains*. Moreover, existing embeddings usually perform poorly when applied to the same type of task but in different domains such as medicine and finance.\n", - "\n", - "In this notebook, we will demonstrate how text embeddings (even for the same text input) can be adjusted to different downstream applications using **task and domain descriptions**, *without* further task- or domain-specific finetuning using the multitask embedding model - [InstructOR](https://huggingface.co/hkunlp/instructor-large)." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "U3yA7yKF4fWX" - }, - "source": [ - "First, we need to install the `InstructorEmbedding` library and other dependencies." 
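The introduction above explains the core InstructOR idea: a single instruction string steers the embedding of the same text toward a task and domain. As a minimal illustrative sketch (an editorial addition, not part of this patch, assuming the `hkunlp/instructor-large` checkpoint used throughout the notebook), the same sentence encoded under two different instructions yields two distinct fixed-size vectors:

```python
# Illustrative sketch: one sentence, two instructions, two different embeddings.
from InstructorEmbedding import INSTRUCTOR
from sklearn.metrics.pairwise import cosine_similarity

model = INSTRUCTOR("hkunlp/instructor-large")

text = "The company reported a sharp rise in quarterly revenue."
finance_vec = model.encode([["Represent the Financial statement: ", text]])
generic_vec = model.encode([["Represent the sentence: ", text]])

# Both vectors share the model's fixed dimension (768 for instructor-large),
# but they differ because the instruction conditions the encoding.
print(finance_vec.shape, generic_vec.shape)
print(cosine_similarity(finance_vec, generic_vec))
```

The exact similarity value will vary; the point is only that the instruction, not just the text, shapes the resulting vector.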
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "iE2xV8O8qTW-", - "outputId": "6f51c5ab-58c6-4b16-90c2-1cf818d7b798" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[?25l \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m0.0/86.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m\u001b[90m\u257a\u001b[0m\u001b[90m\u2501\u001b[0m \u001b[32m81.9/86.0 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m86.0/86.0 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m38.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m51.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m30.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for sentence-transformers (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" - ] - } - ], - "source": [ - "!pip install -qU \\\n", - " InstructorEmbedding \\\n", - " sentence-transformers \\\n", - " pinecone-client" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "9JXi11oSv27H" - }, - "source": [ - "## Initialization\n", - "\n", - "Now, we can instantiate our InstructOR model using the `InstructorEmbedding` library we downloaded above. We just need to specify the [Hugging Face repository name](https://huggingface.co/hkunlp/instructor-large) for the model." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 554, - "referenced_widgets": [ - "6d98af1a220a4266b5a84135f1957083", - "f0353d2fd5d148c694b6192c5d6f260b", - "c9708a27376b445c9fb3c0058831e9eb", - "c259674de9bd4f4dab2e0309ec9b79f9", - "69c6966c7ec14bdab466b5a7406b7be0", - "b973aac9c2f84c2aaeaec4a32e46a71f", - "6105fa58a6424773b145e4b0cc0185b1", - "7eff7dcd19264e45bd9b6273ee743d98", - "737cb455db9c4197a30a43d7c01f365a", - "f1bf6dafa4d54e58bb4522fb960d0b62", - "eb4ca15dbd36404fbf896e168133d1fd", - "c8a1100570fe4232a5aff816e7fdb261", - "52e943c0d2bb46289a98023d8c889587", - "7a7ec575c0324f72b05085bb72d8a89c", - "38268410e9fc41c68cd288925f36b9b4", - "1157ac25219b43ae86de48db8facd61e", - "c9c27ad06bba46319ab5a82ed618cdb0", - "f4b0aa6a747d48b6986cb0687c98baf8", - "6067cbb9c65f41a69b2d3ffed77893f6", - "6f2eac676301449ca8997afebb15e54c", - "395cfce2b56243d29b0703daddbc240f", - "1280b3b7ba934bd5b1c1efcb5ed0d732", - "7cf1792818ea45fda66e528aef1999fe", - "ee20fd0b70154a1bac74b8b2ff018cfd", - "54d3d038084c4d8b8f58a5c70859e5a5", - "66b206256f8c4ac2b697d13b436f21c4", - "e56b062f3b84443f9fd18eeee078c148", - "a579a4d3a0b04969abf78e8f750e3632", - "c1aa98bc2acf439581aa6ffadba1e07e", - "907ff165ec3745e490f88d90337746b8", - "2ba8f2e77afa40f9b2965855098e9b30", - "5d9458d970524004bc4eec13a0d0d77e", - "508c62a8303841c4861cd77fb79faf99", - "968ffad9b0914be280b6b82bd6e521a8", - "6eed17fcdfc847559e345d1d78b24cf6", - "73c25cca2cc04deabae926f59737e3fd", - "11ca37452bb94297ab8d239dfd67c747", - "6b7fe0c05ef94ec2b30083995d15a7d0", - "9f692036dad146d9b3d025beb3f6dc24", - "a397a37a0fd4408f8f96947b98a98e1e", - "94bfd2817d3147a9a705c9f9bd34de72", - "c748559e39874df8981cb1c35e320903", - "3a391fd55afe400d851f63c4d8a75761", - "a2b82378375c4c69855f1571a5fd493d", - "820b6afda5c8477ca101e1034a2cc988", - "1dceb05680a04531be2ab4c7969d69a1", - "087ffa63f36647b1b8fcaf0f29f153b4", - "75e95815284047caa81847290869bff7", - "a14875c6b7cd4db29b82a850ad0ebd8c", - "340a3686161a413c83da71826c01088b", - "e14dc2e1798f4f2ca7922bfaae8053ee", - "ed539def245c4ae5b88d439bff3dcd03", - "1527bbb16280486899ef91b31028fb20", - "14fcc648c880434dbab66105b9829688", - "bc3b1bac660a4a84bbd98be04fc3c42d", - "00c3647e0cf34dfca86e84d692fa5ab8", - "d96eecb1bed14802b33ce36ae85bb0ad", - "065bb5b0ca06453a9254f95e8c40ace7", - "6005bab96f3e41c7a9ab38a744a01e27", - "0769066cb4ea41438fa97d375d2b5ef8", - "a71277d30acb416facd68b43bfd6174a", - "e53ee48772f84de1bdd99c55ac496151", - "deb111c13b2f417d99889129484eee4e", - "861855d2c6cc4ad4bc9f12524ff8bd9a", - "f07b8fdc6f2649059bc352c37a5c57ae", - "ca31ce95f0bb480ca6e22f1966df2345", - "67b9fce651d64f2facf38df06305a1d6", - "49e21cedd4cb4a829097704cd655636a", - "4b97be9ede8f4205a04993f641a6e7a5", - "eb7c71058543462e87343f816b09b526", - "e9e987bbd0054709b7611e5b36f7103e", - "589365224b234c978309950966bcd25a", - "c1df9318560844d186e4b7d010356998", - "f14ee293c4d74c6e8fdcc674c5bceea3", - 
"820fb4c07f744b6d9af8865e5a542959", - "6f46b39b48034ccbb783bbb37b9d1f56", - "3608b7d7657e40bb936399ddd336f04a", - "cae36de5cd93463a9333d98cf347d196", - "e76c8052f6664b14a9d47fc49020b6d3", - "e4b09f3303994ab8a44b6f0142dc56fd", - "fd1a39e3659244ef830ed267fcad9749", - "118eabb99a36467ca570cabc4f175d60", - "66a994ab5ef54a4fa179e6376d963025", - "5f75d31677af48679b879c5cacb38c71", - "fb58a7210fc04694855c2f4b25c3af87", - "01acd668391a48cd9dc98737cdb2338e", - "0cb59914cdef44de8f1aeaf0668f7141", - "aa31057895144fa880e43b59c2eb4336", - "6109891b1f244768bbc3fe1f5c90e0a8", - "1a031bd82bb44c1a8c7a7a804b5d6906", - "1928511bd159470294d547346bf88292", - "a89a5cd30afa44b3b27325e60bed2b49", - "145c7d61156b46c4818a7fd7b955c6e8", - "21e41348039c492594b782b47523861c", - "f2cd9d5826244e7189e6d6c8f02e8243", - "140e88c51b034f43a6ce93bb60aac084", - "d8c7cc6216ca4f439b9b6229bbc068e3", - "065b11945db2472c9aca8499247a327b", - "b5f6bc4df83142ad9ddc1bd910c8a4dd", - "aeb2c5098f784841a9feb753c064882e", - "14a9912ac10c4469b4863c346065a9aa", - "28b90a700c6641e8aef090bc6765ac91", - "a97ed66dbdd741609d3de6492bf7855e", - "fb543c87d106451f889af0e92df9415b", - "8a64123f4ce64bcd924dd45b136afd14", - "67b3969a620042f3976140b2f47569d0", - "4e75478da21e4ed7ad1739f08bb0f25d", - "338e9d046e3648ff90fba09e5f202ae3", - "930d86248ce548d2bc4ce00cb42f3a6c", - "6f4b733076be4b3995982c60e6894aca", - "852d2a71b2824a098159c2808fc31131", - "3cda3bf59a644d45b0676f2d01afe9ba", - "50d40541ca974f2eab108bb6a013bec2", - "7660fd9c5e71453eaf6155ae29029bd2", - "26ce0c22ff134c9fbb032d7be1c884a9", - "6e2083ac39b84941afd4c655ea581b0b", - "2dcb9441fa0b4c7eb77f02292525aba1", - "496730b25aef4878a50e80e1db4277c2", - "23632d95a008469a8df6ff58d25200e2", - "b8137e98682f4783ba01d4e8fcf0e75c", - "08d75c5014e748e688b07f0a02ea524d", - "486a284a86f24252a83e7b58eafa4860", - "d781fcce5beb45799435a7e3ca298aa5", - "b8abf6fc87204035859cb7be2b39e291", - "df118f43b54c451d814132faa32d08e8", - "f66356cf00674270a0a8d0842cab034c", - "c41aeabd4d9a4900a0377fbd6add5870", - "5222054044f840dab623ec9dcf82ce03", - "d3dd9b0db0454f2082bbd13432876402", - "8805b986f2b94f3ea37a9cdb9973675d", - "5e1eb78b0e7d4456b7aadcc3bb9d951a", - "85eb31f4bfd64a4093fc2844e09a097c", - "beff2d0033d64249bceed81931571d5a", - "e0b5ac7fbd33465abda4c2ee7909ba18", - "06f594bf701041d0ac4299853e9c2e00", - "7699e96d1d2a4e169c070319b8609009", - "78dc5e6fff4c4cb98c29b497097dbc71", - "495bb8cc76f04d328cd2861bc1bb4e5d", - "bd2aaf520ffc4121baee37bdc0406a4f", - "ea2adec965694a99a6ef05773fa8481a", - "4d07c74f0ccf418f91472d3dceea9a3e", - "b0ac4151da244d4686723922aabcee60", - "d68a382107ac4623a2517221fd3f5696", - "f5bd59df651a4fc6850dea8c84bcda14", - "f84de8afd0864a0dbf23ded6efb8c896", - "afc8a8652d2e4661a6260f7933babbce", - "b95c85cf4be04540b2d11bee10ac1561", - "069c9a634c544985aca65d3c6d32c8b1", - "607a6cec816a47d5bedb8615d01bb15a", - "ad424904aef74ebb874858d5d59a2a93", - "4344a74108dc4e4aae4ae5187bc8461a", - "22a535c7b50d422ba793209eabbacf8d", - "4365f1321ded473e9451c5e7f4e70e0c", - "764b858f5d854914925a60747223c9bb" - ] - }, - "id": "RYZfIHZAqXrz", - "outputId": "ef54134a-cb72-4b7f-fa96-16ec0b24f7de" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/InstructorEmbedding/instructor.py:7: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. 
in jupyter console)\n", - " from tqdm.autonotebook import trange\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6d98af1a220a4266b5a84135f1957083", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (\u2026)c7233/.gitattributes: 0%| | 0.00/1.48k [00:00 \"Represent the [**domain**] [**text_type**] for [**task_objective**]:\"\n", - "\n", - "Here are some examples:\n", - "\n", - "- \"Represent the **Science** **sentence**:\"\n", - "- \"Represent the **Financial** **statement**:\"\n", - "- \"Represent the **Wikipedia** **document** for **retrieval**:\"\n", - "- \"Represent the **Wikipedia** **question** for **retrieving supporting documents**:\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "6kgmbeXhvuNR" - }, - "source": [ - "### First use case - Sentence Similarity Search\n", - "\n", - "Let's see how we can use the model to compute the semantic similary between two groups of sentences, following the customized embedding template." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "YhAtBfqpq1Sp", - "outputId": "9e9edd50-784b-4785-cba7-5a90f06c7f47" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0.8799274 0.7748538 ]\n", - " [0.7468935 0.82635736]]\n" - ] - } - ], - "source": [ - "from sklearn.metrics.pairwise import cosine_similarity\n", - "\n", - "sentences_a = [['Represent the Pharmaceutical definition: ', 'Aspirin: Aspirin is a widely-used over-the-counter medication known for its anti-inflammatory and analgesic properties. It is commonly used to relieve pain, reduce fever, and alleviate minor aches and pains.'],\n", - " ['Represent the Artistic definition: ', \"Impressionism: Impressionism is an art movement that emerged in the late 19th century, characterized by the use of short brush strokes and the depiction of light and color to capture the fleeting effects of a scene. It emphasizes the artist's immediate perception and emotional response to the subject.\"]]\n", - "sentences_b = [['Represent the Pharmaceutical definition: ', 'Amoxicillin: Amoxicillin is an antibiotic medication commonly prescribed to treat various bacterial infections, such as respiratory, ear, throat, and urinary tract infections. It belongs to the penicillin class of antibiotics and works by inhibiting bacterial cell wall synthesis.'],\n", - " ['Represent the Artistic definition: ', \"Sculpture: Sculpture is a form of visual art that involves creating three-dimensional objects by carving, modeling, or molding materials such as stone, wood, metal, clay, or other materials. Sculptures can be representational or abstract and are often displayed in galleries, museums, or public spaces.\"]]\n", - "\n", - "embeddings_a = model.encode(sentences_a)\n", - "embeddings_b = model.encode(sentences_b)\n", - "\n", - "similarities = cosine_similarity(embeddings_a,embeddings_b)\n", - "\n", - "print(similarities)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "B1oD0NhnnCZa" - }, - "source": [ - "We can see that the sentences from the same domain have a higher similarity score (*88% and 83%*) compared to sentences from a different domain (*77% and 75%*)." 
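To make that interpretation concrete, here is a small sketch (an editorial addition, not part of the patch) that reads the same-domain and cross-domain scores directly off the 2x2 matrix printed by the cell above, using the values shown in that output:

```python
import numpy as np

# Matrix printed above: rows are sentences_a, columns are sentences_b.
similarities = np.array([[0.8799274, 0.7748538],
                         [0.7468935, 0.82635736]])

# Diagonal entries pair texts from the same domain (pharma-pharma, art-art);
# off-diagonal entries pair texts from different domains.
same_domain = np.diag(similarities)
cross_domain = similarities[~np.eye(2, dtype=bool)]

print("same domain:  ", same_domain.round(2))   # [0.88 0.83]
print("cross domain: ", cross_domain.round(2))  # [0.77 0.75]
```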
- ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "2PO-Jfu9vpb5" - }, - "source": [ - "### Second use case - Question Answering\n", - "\n", - "Question-answering is very similar to semantic search in that we're comparing text and calculating a similarity metric between them. However, it differs in that we are not looking for the direct semantic similarity between two chunks of text. We're instead looking for the relevance between a question and an answer (or chunk of text they may contain the answer).\n", - "\n", - "Unlike semantic similarity, our question may have a very different semantic meaning to the context (text containing our answer) that we'd like to retrieve. For example, in a pure semantic search the sentences:\n", - "\n", - "```\n", - "\"tell me the name of the capital of France\"\n", - "```\n", - "\n", - "and\n", - "\n", - "```\n", - "\"Paris is the capital and most populous city of France, with an official estimated population of 2.1M residents as of January 2023 in an area of more than 105km^2.\"\n", - "```\n", - "\n", - "Would not be similar, they have very different meanings despite being about the same topic of Paris.\n", - "\n", - "In question-answering, these two sentences would be a perfect match and should have a very high similarity. Let's see how we apply this idea to our InstructOR embeddings.\n", - "\n", - "In the example below, we define a `question` with the corresponding instruction, and then we create a `corpus` of various sentences with some object definitions and their instructions. Afterward, we utilize `cosine_similarity` to find the most similar document (sentence) from the corpus that we can use to answer our question." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wF-HEb4mr_-C", - "outputId": "10c255b3-a6e2-421e-dd54-050b61794150" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.metrics.pairwise import cosine_similarity\n", - "\n", - "question = [['Represent the Wikipedia question for retrieving supporting documents: ',\n", - " 'When were pocket watches popular?']]\n", - "\n", - "corpus = [['Represent the Wikipedia document for retrieval: ',\n", - " 'A canvas painting is artwork created on a canvas surface using various painting techniques and mediums like oil, acrylic, or watercolor. It is popular in traditional and contemporary art, displayed in galleries, museums, and homes.'],\n", - " ['Represent the Wikipedia document for retrieval: ',\n", - " 'A cinema, also known as a movie theater or movie house, is a venue where films are shown to an audience for entertainment. It typically consists of a large screen, seating arrangements, and audio-visual equipment to project and play movies.'],\n", - " ['Represent the Wikipedia document for retrieval: ',\n", - " 'A pocket watch is a small, portable timekeeping device with a clock face and hands, designed to be carried in a pocket or attached to a chain. It is typically made of materials such as metal, gold, or silver and was popular during the 18th and 19th centuries.'],\n", - " ['Represent the Wikipedia document for retrieval: ',\n", - " 'A laptop is a compact and portable computer with a keyboard and screen, ideal for various tasks on the go. 
It offers versatility for browsing, word processing, multimedia, gaming, and professional work.']]\n", - "\n", - "question_embeddings = model.encode(question)\n", - "corpus_embeddings = model.encode(corpus)\n", - "\n", - "similarities = cosine_similarity(question_embeddings, corpus_embeddings)\n", - "retrieved_doc_id = np.argmax(similarities)\n", - "\n", - "print(retrieved_doc_id)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "1CAOXTr4wvHB" - }, - "source": [ - "We can see that the document containing the most important information is at index 2, which is exactly what we expected." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "eL3VxzRpLudI" - }, - "source": [ - "## Using **Pinecone** as a vector database for storing InstructOR embeddings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we need a place to store these embeddings and enable a efficient vector search through them all. To do that we use Pinecone, we can get a [free API key](https://app.pinecone.io/) and enter it below where we will initialize our connection to Pinecone and create a new index." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from pinecone import Pinecone\n", - "\n", - "# initialize connection to pinecone (get API key at app.pinecone.io)\n", - "api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'\n", - "\n", - "# configure client\n", - "pc = Pinecone(api_key=api_key)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "dWmI3lGaDuEk" - }, - "source": [ - "Let's see how we can enrich the previously described use cases with the **Pinecone** vector database, allowing us to store a large amount of embeddings and still quickly receive relevant results.\n", - "

\n", - "Below, we are going to use a small, custom-made corpus of sentences, but in a real use-case, you can utilize datasets of much larger size by following the same syntax." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "LH-MGCpwyv6X" - }, - "source": [ - "### First use case - Semantic Search with Pinecone" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we setup our index specification, this allows us to define the cloud provider and region where we want to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/docs/projects)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pinecone import ServerlessSpec\n", - "\n", - "cloud = os.environ.get('PINECONE_CLOUD') or 'aws'\n", - "region = os.environ.get('PINECONE_REGION') or 'us-east-1'\n", - "\n", - "spec = ServerlessSpec(cloud=cloud, region=region)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "2PZcGlCGEjlI" - }, - "source": [ - "Here, we are defining the index name that we will use in the initialization process.\n", - "
\n", - "Additionally, the `embeddings_dim` in the cell below is what we need to match with the InstructOR model's embeddings dimension." - ] + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/search/multitask/instructor-multitask.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/learn/search/multitask/instructor-multitask.ipynb)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "z0hK1hvSwOOU" + }, + "source": [ + "# Using **Pinecone** Vector Database with Multitask Embedding Model - [InstructOR](https://huggingface.co/hkunlp/instructor-large)\n", + "\n", + "\n", + "Text embeddings represent discrete text inputs (e.g., sentences, documents, and code) as fixed-sized vectors that can be used in many downstream tasks. These tasks include semantic search, document retrieval for question-answering, prompt retrieval for in-context learning and beyond.\n", + "\n", + "However, most existing embeddings can have *significantly degraded performance when applied to new tasks or domains*. Moreover, existing embeddings usually perform poorly when applied to the same type of task but in different domains such as medicine and finance.\n", + "\n", + "In this notebook, we will demonstrate how text embeddings (even for the same text input) can be adjusted to different downstream applications using **task and domain descriptions**, *without* further task- or domain-specific finetuning using the multitask embedding model - [InstructOR](https://huggingface.co/hkunlp/instructor-large)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "U3yA7yKF4fWX" + }, + "source": [ + "First, we need to install the `InstructorEmbedding` library and other dependencies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "i7PWf-ZQ2luR" - }, - "outputs": [], - "source": [ - "index_name = \"instructor-semantic-search\"" - ] + "id": "iE2xV8O8qTW-", + "outputId": "6f51c5ab-58c6-4b16-90c2-1cf818d7b798" + }, + "outputs": [], + "source": "!pip install -qU \\\n InstructorEmbedding==1.0.1 \\\n sentence-transformers==3.3.1 \\\n pinecone==8.0.0 \\\n torch==2.6.0 \\\n scikit-learn==1.6.1 \\\n numpy==2.2.1" + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "9JXi11oSv27H" + }, + "source": [ + "## Initialization\n", + "\n", + "Now, we can instantiate our InstructOR model using the `InstructorEmbedding` library we downloaded above. We just need to specify the [Hugging Face repository name](https://huggingface.co/hkunlp/instructor-large) for the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 554, + "referenced_widgets": [ + "6d98af1a220a4266b5a84135f1957083", + "f0353d2fd5d148c694b6192c5d6f260b", + "c9708a27376b445c9fb3c0058831e9eb", + "c259674de9bd4f4dab2e0309ec9b79f9", + "69c6966c7ec14bdab466b5a7406b7be0", + "b973aac9c2f84c2aaeaec4a32e46a71f", + "6105fa58a6424773b145e4b0cc0185b1", + "7eff7dcd19264e45bd9b6273ee743d98", + "737cb455db9c4197a30a43d7c01f365a", + "f1bf6dafa4d54e58bb4522fb960d0b62", + "eb4ca15dbd36404fbf896e168133d1fd", + "c8a1100570fe4232a5aff816e7fdb261", + "52e943c0d2bb46289a98023d8c889587", + "7a7ec575c0324f72b05085bb72d8a89c", + "38268410e9fc41c68cd288925f36b9b4", + "1157ac25219b43ae86de48db8facd61e", + "c9c27ad06bba46319ab5a82ed618cdb0", + "f4b0aa6a747d48b6986cb0687c98baf8", + "6067cbb9c65f41a69b2d3ffed77893f6", + "6f2eac676301449ca8997afebb15e54c", + "395cfce2b56243d29b0703daddbc240f", + "1280b3b7ba934bd5b1c1efcb5ed0d732", + "7cf1792818ea45fda66e528aef1999fe", + "ee20fd0b70154a1bac74b8b2ff018cfd", + "54d3d038084c4d8b8f58a5c70859e5a5", + "66b206256f8c4ac2b697d13b436f21c4", + "e56b062f3b84443f9fd18eeee078c148", + "a579a4d3a0b04969abf78e8f750e3632", + "c1aa98bc2acf439581aa6ffadba1e07e", + "907ff165ec3745e490f88d90337746b8", + "2ba8f2e77afa40f9b2965855098e9b30", + "5d9458d970524004bc4eec13a0d0d77e", + "508c62a8303841c4861cd77fb79faf99", + "968ffad9b0914be280b6b82bd6e521a8", + "6eed17fcdfc847559e345d1d78b24cf6", + "73c25cca2cc04deabae926f59737e3fd", + "11ca37452bb94297ab8d239dfd67c747", + "6b7fe0c05ef94ec2b30083995d15a7d0", + "9f692036dad146d9b3d025beb3f6dc24", + "a397a37a0fd4408f8f96947b98a98e1e", + "94bfd2817d3147a9a705c9f9bd34de72", + "c748559e39874df8981cb1c35e320903", + "3a391fd55afe400d851f63c4d8a75761", + "a2b82378375c4c69855f1571a5fd493d", + "820b6afda5c8477ca101e1034a2cc988", + "1dceb05680a04531be2ab4c7969d69a1", + "087ffa63f36647b1b8fcaf0f29f153b4", + "75e95815284047caa81847290869bff7", + "a14875c6b7cd4db29b82a850ad0ebd8c", + "340a3686161a413c83da71826c01088b", + "e14dc2e1798f4f2ca7922bfaae8053ee", + "ed539def245c4ae5b88d439bff3dcd03", + "1527bbb16280486899ef91b31028fb20", + "14fcc648c880434dbab66105b9829688", + "bc3b1bac660a4a84bbd98be04fc3c42d", + "00c3647e0cf34dfca86e84d692fa5ab8", + "d96eecb1bed14802b33ce36ae85bb0ad", + "065bb5b0ca06453a9254f95e8c40ace7", + "6005bab96f3e41c7a9ab38a744a01e27", + "0769066cb4ea41438fa97d375d2b5ef8", + "a71277d30acb416facd68b43bfd6174a", + "e53ee48772f84de1bdd99c55ac496151", + "deb111c13b2f417d99889129484eee4e", + "861855d2c6cc4ad4bc9f12524ff8bd9a", + "f07b8fdc6f2649059bc352c37a5c57ae", + "ca31ce95f0bb480ca6e22f1966df2345", + "67b9fce651d64f2facf38df06305a1d6", + "49e21cedd4cb4a829097704cd655636a", + "4b97be9ede8f4205a04993f641a6e7a5", + "eb7c71058543462e87343f816b09b526", + "e9e987bbd0054709b7611e5b36f7103e", + "589365224b234c978309950966bcd25a", + "c1df9318560844d186e4b7d010356998", + "f14ee293c4d74c6e8fdcc674c5bceea3", + "820fb4c07f744b6d9af8865e5a542959", + "6f46b39b48034ccbb783bbb37b9d1f56", + "3608b7d7657e40bb936399ddd336f04a", + "cae36de5cd93463a9333d98cf347d196", + "e76c8052f6664b14a9d47fc49020b6d3", + "e4b09f3303994ab8a44b6f0142dc56fd", + "fd1a39e3659244ef830ed267fcad9749", + "118eabb99a36467ca570cabc4f175d60", + "66a994ab5ef54a4fa179e6376d963025", + "5f75d31677af48679b879c5cacb38c71", + "fb58a7210fc04694855c2f4b25c3af87", + "01acd668391a48cd9dc98737cdb2338e", + "0cb59914cdef44de8f1aeaf0668f7141", + "aa31057895144fa880e43b59c2eb4336", + "6109891b1f244768bbc3fe1f5c90e0a8", 
+ "1a031bd82bb44c1a8c7a7a804b5d6906", + "1928511bd159470294d547346bf88292", + "a89a5cd30afa44b3b27325e60bed2b49", + "145c7d61156b46c4818a7fd7b955c6e8", + "21e41348039c492594b782b47523861c", + "f2cd9d5826244e7189e6d6c8f02e8243", + "140e88c51b034f43a6ce93bb60aac084", + "d8c7cc6216ca4f439b9b6229bbc068e3", + "065b11945db2472c9aca8499247a327b", + "b5f6bc4df83142ad9ddc1bd910c8a4dd", + "aeb2c5098f784841a9feb753c064882e", + "14a9912ac10c4469b4863c346065a9aa", + "28b90a700c6641e8aef090bc6765ac91", + "a97ed66dbdd741609d3de6492bf7855e", + "fb543c87d106451f889af0e92df9415b", + "8a64123f4ce64bcd924dd45b136afd14", + "67b3969a620042f3976140b2f47569d0", + "4e75478da21e4ed7ad1739f08bb0f25d", + "338e9d046e3648ff90fba09e5f202ae3", + "930d86248ce548d2bc4ce00cb42f3a6c", + "6f4b733076be4b3995982c60e6894aca", + "852d2a71b2824a098159c2808fc31131", + "3cda3bf59a644d45b0676f2d01afe9ba", + "50d40541ca974f2eab108bb6a013bec2", + "7660fd9c5e71453eaf6155ae29029bd2", + "26ce0c22ff134c9fbb032d7be1c884a9", + "6e2083ac39b84941afd4c655ea581b0b", + "2dcb9441fa0b4c7eb77f02292525aba1", + "496730b25aef4878a50e80e1db4277c2", + "23632d95a008469a8df6ff58d25200e2", + "b8137e98682f4783ba01d4e8fcf0e75c", + "08d75c5014e748e688b07f0a02ea524d", + "486a284a86f24252a83e7b58eafa4860", + "d781fcce5beb45799435a7e3ca298aa5", + "b8abf6fc87204035859cb7be2b39e291", + "df118f43b54c451d814132faa32d08e8", + "f66356cf00674270a0a8d0842cab034c", + "c41aeabd4d9a4900a0377fbd6add5870", + "5222054044f840dab623ec9dcf82ce03", + "d3dd9b0db0454f2082bbd13432876402", + "8805b986f2b94f3ea37a9cdb9973675d", + "5e1eb78b0e7d4456b7aadcc3bb9d951a", + "85eb31f4bfd64a4093fc2844e09a097c", + "beff2d0033d64249bceed81931571d5a", + "e0b5ac7fbd33465abda4c2ee7909ba18", + "06f594bf701041d0ac4299853e9c2e00", + "7699e96d1d2a4e169c070319b8609009", + "78dc5e6fff4c4cb98c29b497097dbc71", + "495bb8cc76f04d328cd2861bc1bb4e5d", + "bd2aaf520ffc4121baee37bdc0406a4f", + "ea2adec965694a99a6ef05773fa8481a", + "4d07c74f0ccf418f91472d3dceea9a3e", + "b0ac4151da244d4686723922aabcee60", + "d68a382107ac4623a2517221fd3f5696", + "f5bd59df651a4fc6850dea8c84bcda14", + "f84de8afd0864a0dbf23ded6efb8c896", + "afc8a8652d2e4661a6260f7933babbce", + "b95c85cf4be04540b2d11bee10ac1561", + "069c9a634c544985aca65d3c6d32c8b1", + "607a6cec816a47d5bedb8615d01bb15a", + "ad424904aef74ebb874858d5d59a2a93", + "4344a74108dc4e4aae4ae5187bc8461a", + "22a535c7b50d422ba793209eabbacf8d", + "4365f1321ded473e9451c5e7f4e70e0c", + "764b858f5d854914925a60747223c9bb" + ] }, + "id": "RYZfIHZAqXrz", + "outputId": "ef54134a-cb72-4b7f-fa96-16ec0b24f7de" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "\n", - "# check if index already exists (it shouldn't if this is first time)\n", - "if index_name not in pc.list_indexes().names():\n", - " # if does not exist, create index\n", - " pc.create_index(\n", - " index_name,\n", - " dimension=embeddings_dim,\n", - " metric='cosine',\n", - " spec=spec\n", - " )\n", - " # wait for index to be initialized\n", - " while not pc.describe_index(index_name).status['ready']:\n", - " time.sleep(1)\n", - "\n", - "# connect to index\n", - "index = pc.Index(index_name)\n", - "# view index stats\n", - "index.describe_index_stats()" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/InstructorEmbedding/instructor.py:7: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. 
in jupyter console)\n", + " from tqdm.autonotebook import trange\n" + ] }, { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "v8k2VTenFJoo" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6d98af1a220a4266b5a84135f1957083", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "We need to create a corpus of `text_input` and `instruction` pairs, as shown below. Each instruction follows the template we described above (*Represent the ...*), while `text_input` contains the actual content.\n", - "

\n", - "We are adding definitions from different domains so that we can test later on how our similarity search is going to match these examples." + "text/plain": [ + "Downloading (…)c7233/.gitattributes: 0%| | 0.00/1.48k [00:00\n", - "Afterward, we can move on to the query phase." + "text/plain": [ + "Downloading (…)b15c7233/config.json: 0%| | 0.00/1.53k [00:00 \"Represent the [**domain**] [**text_type**] for [**task_objective**]:\"\n", + "\n", + "Here are some examples:\n", + "\n", + "- \"Represent the **Science** **sentence**:\"\n", + "- \"Represent the **Financial** **statement**:\"\n", + "- \"Represent the **Wikipedia** **document** for **retrieval**:\"\n", + "- \"Represent the **Wikipedia** **question** for **retrieving supporting documents**:\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "6kgmbeXhvuNR" + }, + "source": [ + "### First use case - Sentence Similarity Search\n", + "\n", + "Let's see how we can use the model to compute the semantic similary between two groups of sentences, following the customized embedding template." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "YhAtBfqpq1Sp", + "outputId": "9e9edd50-784b-4785-cba7-5a90f06c7f47" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2jt62jqXMkPx", - "outputId": "78beb449-47c0-44b9-ae1a-285948584ac4" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'upserted_count': 4}" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "index.upsert(vectors=records)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.8799274 0.7748538 ]\n", + " [0.7468935 0.82635736]]\n" + ] + } + ], + "source": [ + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "sentences_a = [['Represent the Pharmaceutical definition: ', 'Aspirin: Aspirin is a widely-used over-the-counter medication known for its anti-inflammatory and analgesic properties. It is commonly used to relieve pain, reduce fever, and alleviate minor aches and pains.'],\n", + " ['Represent the Artistic definition: ', \"Impressionism: Impressionism is an art movement that emerged in the late 19th century, characterized by the use of short brush strokes and the depiction of light and color to capture the fleeting effects of a scene. It emphasizes the artist's immediate perception and emotional response to the subject.\"]]\n", + "sentences_b = [['Represent the Pharmaceutical definition: ', 'Amoxicillin: Amoxicillin is an antibiotic medication commonly prescribed to treat various bacterial infections, such as respiratory, ear, throat, and urinary tract infections. It belongs to the penicillin class of antibiotics and works by inhibiting bacterial cell wall synthesis.'],\n", + " ['Represent the Artistic definition: ', \"Sculpture: Sculpture is a form of visual art that involves creating three-dimensional objects by carving, modeling, or molding materials such as stone, wood, metal, clay, or other materials. 
Sculptures can be representational or abstract and are often displayed in galleries, museums, or public spaces.\"]]\n", + "\n", + "embeddings_a = model.encode(sentences_a)\n", + "embeddings_b = model.encode(sentences_b)\n", + "\n", + "similarities = cosine_similarity(embeddings_a,embeddings_b)\n", + "\n", + "print(similarities)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "B1oD0NhnnCZa" + }, + "source": [ + "We can see that the sentences from the same domain have a higher similarity score (*88% and 83%*) compared to sentences from a different domain (*77% and 75%*)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "2PO-Jfu9vpb5" + }, + "source": [ + "### Second use case - Question Answering\n", + "\n", + "Question-answering is very similar to semantic search in that we're comparing text and calculating a similarity metric between them. However, it differs in that we are not looking for the direct semantic similarity between two chunks of text. We're instead looking for the relevance between a question and an answer (or chunk of text they may contain the answer).\n", + "\n", + "Unlike semantic similarity, our question may have a very different semantic meaning to the context (text containing our answer) that we'd like to retrieve. For example, in a pure semantic search the sentences:\n", + "\n", + "```\n", + "\"tell me the name of the capital of France\"\n", + "```\n", + "\n", + "and\n", + "\n", + "```\n", + "\"Paris is the capital and most populous city of France, with an official estimated population of 2.1M residents as of January 2023 in an area of more than 105km^2.\"\n", + "```\n", + "\n", + "Would not be similar, they have very different meanings despite being about the same topic of Paris.\n", + "\n", + "In question-answering, these two sentences would be a perfect match and should have a very high similarity. Let's see how we apply this idea to our InstructOR embeddings.\n", + "\n", + "In the example below, we define a `question` with the corresponding instruction, and then we create a `corpus` of various sentences with some object definitions and their instructions. Afterward, we utilize `cosine_similarity` to find the most similar document (sentence) from the corpus that we can use to answer our question." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "wF-HEb4mr_-C", + "outputId": "10c255b3-a6e2-421e-dd54-050b61794150" + }, + "outputs": [ { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "Y2rWc3i3JXam" - }, - "source": [ - "Let's give it a try with some simple question first." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "question = [['Represent the Wikipedia question for retrieving supporting documents: ',\n", + " 'When were pocket watches popular?']]\n", + "\n", + "corpus = [['Represent the Wikipedia document for retrieval: ',\n", + " 'A canvas painting is artwork created on a canvas surface using various painting techniques and mediums like oil, acrylic, or watercolor. 
It is popular in traditional and contemporary art, displayed in galleries, museums, and homes.'],\n", + " ['Represent the Wikipedia document for retrieval: ',\n", + " 'A cinema, also known as a movie theater or movie house, is a venue where films are shown to an audience for entertainment. It typically consists of a large screen, seating arrangements, and audio-visual equipment to project and play movies.'],\n", + " ['Represent the Wikipedia document for retrieval: ',\n", + " 'A pocket watch is a small, portable timekeeping device with a clock face and hands, designed to be carried in a pocket or attached to a chain. It is typically made of materials such as metal, gold, or silver and was popular during the 18th and 19th centuries.'],\n", + " ['Represent the Wikipedia document for retrieval: ',\n", + " 'A laptop is a compact and portable computer with a keyboard and screen, ideal for various tasks on the go. It offers versatility for browsing, word processing, multimedia, gaming, and professional work.']]\n", + "\n", + "question_embeddings = model.encode(question)\n", + "corpus_embeddings = model.encode(corpus)\n", + "\n", + "similarities = cosine_similarity(question_embeddings, corpus_embeddings)\n", + "retrieved_doc_id = np.argmax(similarities)\n", + "\n", + "print(retrieved_doc_id)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "1CAOXTr4wvHB" + }, + "source": [ + "We can see that the document containing the most important information is at index 2, which is exactly what we expected." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "eL3VxzRpLudI" + }, + "source": [ + "## Using **Pinecone** as a vector database for storing InstructOR embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "Now we need a place to store these embeddings and enable efficient vector search through them all. To do that we use Pinecone. You can get a [free API key](https://app.pinecone.io/) and enter it below where we will initialize our connection to Pinecone and create a new index." + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pinecone import Pinecone\n", + "\n", + "# initialize connection to pinecone (get API key at app.pinecone.io)\n", + "api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'\n", + "\n", + "# configure client\n", + "pc = Pinecone(api_key=api_key)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "dWmI3lGaDuEk" + }, + "source": [ + "Let's see how we can enrich the previously described use cases with the **Pinecone** vector database, allowing us to store a large amount of embeddings and still quickly receive relevant results.\n", + "

\n", + "Below, we are going to use a small, custom-made corpus of sentences, but in a real use-case, you can utilize datasets of much larger size by following the same syntax." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "LH-MGCpwyv6X" + }, + "source": [ + "### First use case - Semantic Search with Pinecone" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we setup our index specification, this allows us to define the cloud provider and region where we want to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/docs/projects)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pinecone import ServerlessSpec\n", + "\n", + "cloud = os.environ.get('PINECONE_CLOUD') or 'aws'\n", + "region = os.environ.get('PINECONE_REGION') or 'us-east-1'\n", + "\n", + "spec = ServerlessSpec(cloud=cloud, region=region)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "2PZcGlCGEjlI" + }, + "source": [ + "Here, we are defining the index name that we will use in the initialization process.\n", + "
\n", + "Additionally, the `embeddings_dim` in the cell below is what we need to match with the InstructOR model's embeddings dimension." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "i7PWf-ZQ2luR" + }, + "outputs": [], + "source": [ + "index_name = \"instructor-semantic-search\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "# check if index already exists (it shouldn't if this is first time)\n", + "if index_name not in pc.list_indexes().names():\n", + " # if does not exist, create index\n", + " pc.create_index(\n", + " index_name,\n", + " dimension=embeddings_dim,\n", + " metric='cosine',\n", + " spec=spec\n", + " )\n", + " # wait for index to be initialized\n", + " while not pc.describe_index(index_name).status['ready']:\n", + " time.sleep(1)\n", + "\n", + "# connect to index\n", + "index = pc.Index(index_name)\n", + "# view index stats\n", + "index.describe_index_stats()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "v8k2VTenFJoo" + }, + "source": [ + "We need to create a corpus of `text_input` and `instruction` pairs, as shown below. Each instruction follows the template we described above (*Represent the ...*), while `text_input` contains the actual content.\n", + "

\n", + "We are adding definitions from different domains so that we can test later on how our similarity search is going to match these examples." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "W98-F1aT2ucq" + }, + "outputs": [], + "source": [ + "corpus = [['Represent the Pharmaceutical definition: ','Aspirin: Aspirin is a widely-used over-the-counter medication known for its anti-inflammatory and analgesic properties. It is commonly used to relieve pain, reduce fever, and alleviate minor aches and pains.'],\n", + " ['Represent the Pharmaceutical definition: ','Amoxicillin: Amoxicillin is an antibiotic medication commonly prescribed to treat various bacterial infections, such as respiratory, ear, throat, and urinary tract infections. It belongs to the penicillin class of antibiotics and works by inhibiting bacterial cell wall synthesis.'],\n", + " ['Represent the Pharmaceutical definition: ','Atorvastatin: Atorvastatin is a lipid-lowering medication used to manage high cholesterol levels and reduce the risk of cardiovascular events. It belongs to the statin class of drugs and works by inhibiting an enzyme involved in cholesterol production in the liver.'],\n", + " ['Represent the Financial definition: ', \"Asset Allocation: Asset allocation is a financial strategy that involves distributing an investment portfolio across various asset classes, such as stocks, bonds, cash, and real estate, to achieve the desired risk-return balance based on an individual's financial goals and risk tolerance.\"],\n", + " ['Represent the Financial definition: ', 'Capital Gains: Capital gains refer to the profits realized from the sale of a capital asset, such as stocks, real estate, or mutual funds, at a price higher than its original purchase price. These gains are subject to capital gains taxes, which vary based on the holding period and tax laws of the country.'],\n", + " ['Represent the Financial definition: ', \"Debt-to-Equity Ratio: The debt-to-equity ratio is a financial metric used to assess a company's financial leverage. It is calculated by dividing the total debt (long-term and short-term liabilities) of a company by its total shareholders' equity. A higher ratio indicates a higher level of debt financing relative to equity, which may signify higher financial risk.\"],\n", + " ['Represent the Artistic definition: ', \"Impressionism: Impressionism is an art movement that emerged in the late 19th century, characterized by the use of short brush strokes and the depiction of light and color to capture the fleeting effects of a scene. It emphasizes the artist's immediate perception and emotional response to the subject.\"],\n", + " ['Represent the Artistic definition: ', \"Sculpture: Sculpture is a form of visual art that involves creating three-dimensional objects by carving, modeling, or molding materials such as stone, wood, metal, clay, or other materials. Sculptures can be representational or abstract and are often displayed in galleries, museums, or public spaces.\"],\n", + " ['Represent the Artistic definition: ', \"Abstract Expressionism: Abstract Expressionism is an art movement that developed in the mid-20th century, characterized by non-representational and spontaneous artworks conveying the artist's emotions and subconscious thoughts. 
It often features large-scale canvases with bold brushwork and a focus on the artist's gestural movements.\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49, + "referenced_widgets": [ + "44c9872eb5b74ec8a8b298e1a8e2c8a6", + "b6602c9725454eb4b301bfba2a58bcc4", + "9683c34a7b8045ec89c68033806c7acc", + "04371c07c235476b9c00c709b5e5fbd7", + "86638eead77a434597c0ef4cd0b0f592", + "e88b39f22ea0408687fa57e20ecb06e8", + "7c1335ba88d245d3af4e22a49fd962cd", + "61b8ab0e0a6f4bbfbd4d1ed8b37bc8bc", + "a0c3b67b0424491faf81f70576407201", + "b33fd40877e345afa7ad164fa814b456", + "3fdfc1758e7a457cba9fcf2327f69d54" + ] }, + "id": "RbGBAg2C23Dh", + "outputId": "14eb7b8b-5a3c-46aa-8590-88c2e4f3d5a0" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FI3VKm-A1rMq", - "outputId": "f9df7abe-c6d6-4f54-d2c6-a860d54d8415" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "44c9872eb5b74ec8a8b298e1a8e2c8a6", + "version_major": 2, + "version_minor": 0 }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'matches': [{'id': '1',\n", - " 'metadata': {'text': 'A cinema, also known as a movie theater or '\n", - " 'movie house, is a venue where films are '\n", - " 'shown to an audience for entertainment. It '\n", - " 'typically consists of a large screen, '\n", - " 'seating arrangements, and audio-visual '\n", - " 'equipment to project and play movies.'},\n", - " 'score': 0.901540339,\n", - " 'values': []}],\n", - " 'namespace': ''}" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "question = [['Represent the Wikipedia question for retrieving supporting documents: ',\n", - " 'Is there any other name for a cinema?']]\n", - "\n", - "# create the question embedding\n", - "question_embedding = model.encode(question)\n", - "\n", - "# now query\n", - "result = index.query(vector=question_embedding.tolist(), top_k=1, include_metadata=True)\n", - "result" + "text/plain": [ + "Batches: 0%| | 0/3 [00:00\n", + "Afterward, we can move on to the query phase." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "6H2I5W_gwQpT", + "outputId": "c3f21004-f683-4063-8642-2a3f93a99cb7" + }, + "outputs": [ { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "r8M3n_adJdCi" - }, - "source": [ - "Perfect! We have found the document containing the information that answers our question. Now let's check one more." 
+ "data": { + "text/plain": [ + "{'upserted_count': 9}" ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index.upsert(vectors=records)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "B4lbIqIQHH2e" + }, + "source": [ + "In the query phase, we are preparing one definition of post-impressionism and then checking which documents in the index are the most semantically similar.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "rb-I63PR27ps", + "outputId": "6d5ff1cf-e830-40c7-c957-6ff75c6b1c55" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "xHclzDRUb7pi", - "outputId": "3e6f65f7-c023-462c-daee-f9dc894bb763" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'matches': [{'id': '2',\n", - " 'metadata': {'text': 'A pocket watch is a small, portable '\n", - " 'timekeeping device with a clock face and '\n", - " 'hands, designed to be carried in a pocket '\n", - " 'or attached to a chain. It is typically '\n", - " 'made of materials such as metal, gold, or '\n", - " 'silver and was popular during the 18th and '\n", - " '19th centuries.'},\n", - " 'score': 0.910102367,\n", - " 'values': []}],\n", - " 'namespace': ''}" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "question = [['Represent the Wikipedia question for retrieving supporting documents: ',\n", - " 'When were pocket watches popular?']]\n", - "\n", - "# create the question embedding\n", - "question_embedding = model.encode(question)\n", - "\n", - "# now query\n", - "result = index.query(vector=question_embedding.tolist(), top_k=1, include_metadata=True)\n", - "result" + "data": { + "text/plain": [ + "{'matches': [{'id': '6',\n", + " 'metadata': {'text': 'Impressionism: Impressionism is an art '\n", + " 'movement that emerged in the late 19th '\n", + " 'century, characterized by the use of short '\n", + " 'brush strokes and the depiction of light '\n", + " 'and color to capture the fleeting effects '\n", + " \"of a scene. It emphasizes the artist's \"\n", + " 'immediate perception and emotional '\n", + " 'response to the subject.'},\n", + " 'score': 0.927976,\n", + " 'values': []},\n", + " {'id': '8',\n", + " 'metadata': {'text': 'Abstract Expressionism: Abstract '\n", + " 'Expressionism is an art movement that '\n", + " 'developed in the mid-20th century, '\n", + " 'characterized by non-representational and '\n", + " 'spontaneous artworks conveying the '\n", + " \"artist's emotions and subconscious \"\n", + " 'thoughts. It often features large-scale '\n", + " 'canvases with bold brushwork and a focus '\n", + " \"on the artist's gestural movements.\"},\n", + " 'score': 0.927723885,\n", + " 'values': []},\n", + " {'id': '7',\n", + " 'metadata': {'text': 'Sculpture: Sculpture is a form of visual '\n", + " 'art that involves creating '\n", + " 'three-dimensional objects by carving, '\n", + " 'modeling, or molding materials such as '\n", + " 'stone, wood, metal, clay, or other '\n", + " 'materials. 
Sculptures can be '\n", + " 'representational or abstract and are often '\n", + " 'displayed in galleries, museums, or public '\n", + " 'spaces.'},\n", + " 'score': 0.815320492,\n", + " 'values': []}],\n", + " 'namespace': ''}" ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = [['Represent the Artistic definition: ','Post-Impressionism: Post-Impressionism is an art movement that developed in the late 19th and early 20th centuries as a reaction to Impressionism. Artists associated with Post-Impressionism sought to explore new ways of expressing emotions and ideas through their art. While retaining some aspects of Impressionism, they moved towards more symbolic and abstract representations, emphasizing the use of color, form, and brushwork to convey deeper meaning and subjective experiences.']]\n", + "\n", + "# create the query embedding\n", + "query_embedding = model.encode(query)\n", + "\n", + "# now query\n", + "result = index.query(vector=query_embedding.tolist(), top_k=3, include_metadata=True)\n", + "result" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "sJcByLt0HpgI" + }, + "source": [ + "We received great results! Our most similar definition is the one about impressionism, and the following two are both from the artistic domain. Therefore, we can say that the InstructOR embeddings were created successfully, and Pinecone database produced relevant similarity search results." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "id": "gU2IJVNx2-SZ" + }, + "outputs": [], + "source": [ + "pc.delete_index(index_name)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jJ3IvZh2z41F" + }, + "source": [ + "### Second use case - Question-Answering with Pinecone" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "P-sm9slZIHuu" + }, + "source": [ + "Let's see how InstructOR and Pinecone will behave in our second use-case: question-answering.\n", + "\n", + "\n", + "Again, we are preparing our index by setting the index name, creating embeddings, and moving them into the right format." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "_NfH8uxD_MvG" + }, + "outputs": [], + "source": [ + "index_name = \"instructor-information-retrieval\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "# check if index already exists (it shouldn't if this is first time)\n", + "if index_name not in pc.list_indexes().names():\n", + " # if does not exist, create index\n", + " pc.create_index(\n", + " index_name,\n", + " dimension=embeddings_dim,\n", + " metric='cosine',\n", + " spec=spec\n", + " )\n", + " # wait for index to be initialized\n", + " while not pc.describe_index(index_name).status['ready']:\n", + " time.sleep(1)\n", + "\n", + "# connect to index\n", + "index = pc.Index(index_name)\n", + "# view index stats\n", + "index.describe_index_stats()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "W1K-ufoeIqsg" + }, + "source": [ + "In this use-case, we have a slightly different corpus. Our instructions now specify that we need to use these Wikipedia documents for retrieval. This will assist us in the query phase, where we will provide the question and attempt to find the answer within the documents from the corpus." 
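The corpus cell that follows keeps the document set deliberately small. For a larger collection, the encode-and-upsert step would typically be batched. The sketch below is an editorial illustration: the `upsert_wikipedia_documents` helper and `batch_size` are hypothetical, and it assumes the `model` and `index` objects created in the cells above as well as the tuple form `(id, values, metadata)` accepted by the client's `upsert`:

```python
# Hypothetical helper: index a larger list of plain-text documents in batches.
batch_size = 100
doc_instruction = "Represent the Wikipedia document for retrieval: "

def upsert_wikipedia_documents(documents):
    """documents: list of plain-text strings to embed and upsert."""
    for start in range(0, len(documents), batch_size):
        chunk = documents[start:start + batch_size]
        embeddings = model.encode([[doc_instruction, doc] for doc in chunk])
        vectors = [
            (str(start + i), embedding.tolist(), {"text": doc})
            for i, (embedding, doc) in enumerate(zip(embeddings, chunk))
        ]
        index.upsert(vectors=vectors)
```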
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "id": "SrLCuapSLCWO" + }, + "outputs": [], + "source": [ + "corpus = [['Represent the Wikipedia document for retrieval: ',\n", + " 'A canvas painting is artwork created on a canvas surface using various painting techniques and mediums like oil, acrylic, or watercolor. It is popular in traditional and contemporary art, displayed in galleries, museums, and homes.'],\n", + " ['Represent the Wikipedia document for retrieval: ',\n", + " 'A cinema, also known as a movie theater or movie house, is a venue where films are shown to an audience for entertainment. It typically consists of a large screen, seating arrangements, and audio-visual equipment to project and play movies.'],\n", + " ['Represent the Wikipedia document for retrieval: ',\n", + " 'A pocket watch is a small, portable timekeeping device with a clock face and hands, designed to be carried in a pocket or attached to a chain. It is typically made of materials such as metal, gold, or silver and was popular during the 18th and 19th centuries.'],\n", + " ['Represent the Wikipedia document for retrieval: ',\n", + " 'A laptop is a compact and portable computer with a keyboard and screen, ideal for various tasks on the go. It offers versatility for browsing, word processing, multimedia, gaming, and professional work.']]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49, + "referenced_widgets": [ + "f53ea94c77894de29ce50f4379025f27", + "a1df8a9c97274b19a417fa99ac79783a", + "e9f3a2f3fe3a42aab5eed542fd2a40f3", + "5b45f6ada2694cc0b5922c497df08bc1", + "b10fbb2f311d47c0954dfa66e62c0001", + "5205854f061845c7bb13b02c461a9813", + "4c04f16940834c9aa41187ee0de4ec50", + "a590be1b2f6148999b0c0cf34ab20f64", + "4574b0e72cca4872a9d7c3322981a0d4", + "1f151734cec24b8f8eed4145dff9f170", + "01c5e43d46314bfab3679087aa93883c" + ] }, + "id": "_QNykpJ7LHFS", + "outputId": "de63879a-d943-4566-987a-7b218331bc2f" + }, + "outputs": [ { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "1duWYbhNKONi" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f53ea94c77894de29ce50f4379025f27", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "Again, we received relevant results." + "text/plain": [ + "Batches: 0%| | 0/1 [00:00
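Taken together, the question-answering cells in this patch reduce to a small query helper. The sketch below is an editorial summary rather than part of the notebook; it assumes the `model` and `index` objects from the cells above, reuses the same instruction string and `index.query(...)` call shown in the patch, and reads matches via the response's `matches` attribute:

```python
# Sketch: end-to-end question answering against the populated index.
question_instruction = "Represent the Wikipedia question for retrieving supporting documents: "

def answer_question(question, top_k=1):
    question_embedding = model.encode([[question_instruction, question]])
    result = index.query(
        vector=question_embedding.tolist(),
        top_k=top_k,
        include_metadata=True,
    )
    return [match.metadata["text"] for match in result.matches]

# Example mirroring the notebook's queries:
# answer_question("When were pocket watches popular?")
```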