使用LlamaIndex的Chroma多模态演示¶

Chroma 是一个专注于开发者生产力和幸福感的AI原生开源向量数据库。Chroma 采用 Apache 2.0 许可。

Chroma 提供完整的类型提示、经过充分测试并拥有完善的文档。

安装 Chroma

pip install chromadb

Chroma 支持多种运行模式。请参见下文了解每种模式与 LlamaIndex 集成的示例。

内存模式
带持久化的内存模式
在 Docker 容器中

像任何其他数据库一样，你可以执行以下操作：

.add
.get
.update
.upsert
.delete
.peek

此外，.query 用于运行相似性搜索。

请在 docs 查看完整文档。

基本示例¶

在这个基本示例中，我们使用 Paul Graham 的一篇散文，将其分块，使用开源的嵌入模型进行嵌入，然后加载到 Chroma 中，最后进行查询。

如果您在 colab 中打开此 Notebook，可能需要安装 LlamaIndex 🦙。

In [ ]

Copied!

%pip install llama-index-vector-stores-qdrant
%pip install llama-index-embeddings-huggingface
%pip install llama-index-vector-stores-chroma
%pip install llama-index-vector-stores-qdrant %pip install llama-index-embeddings-huggingface %pip install llama-index-vector-stores-chroma

In [ ]

Copied!

!pip install llama-index
!pip install llama-index

创建 Chroma 索引¶

In [ ]

Copied!

!pip install llama-index chromadb --quiet
!pip install chromadb==0.4.17
!pip install sentence-transformers
!pip install pydantic==1.10.11
!pip install open-clip-torch
!pip install llama-index chromadb --quiet !pip install chromadb==0.4.17 !pip install sentence-transformers !pip install pydantic==1.10.11 !pip install open-clip-torch

In [ ]

Copied!





# import
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from IPython.display import Markdown, display
import chromadb
# import from llama_index.core import VectorStoreIndex, SimpleDirectoryReader from llama_index.vector_stores.chroma import ChromaVectorStore from llama_index.core import StorageContext from llama_index.embeddings.huggingface import HuggingFaceEmbedding from IPython.display import Markdown, display import chromadb

In [ ]

Copied!

# set up OpenAI
import os
import openai

# Reuse a key that is already exported in the environment instead of
# overwriting it with an empty string; replace the "" fallback with a key
# only when none is exported.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
# set up OpenAI import os import openai OPENAI_API_KEY = "" openai.api_key = OPENAI_API_KEY os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

从维基百科下载图片和文本¶

In [ ]

Copied!





import requests


def get_wikipedia_images(title, timeout=30):
    """Return the URLs of the JPG/PNG images listed for a Wikipedia page.

    Sends a ``query`` request with the ``images`` generator to the English
    Wikipedia API and collects the first ``imageinfo`` URL of every page
    entry in the response.

    Args:
        title: Title of the Wikipedia page to inspect.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of URLs ending in ``.jpg`` or ``.png``. The list is empty
        when the response has no ``query``/``pages`` section or none of the
        entries carries a matching URL.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "imageinfo",
            "iiprop": "url|dimensions|mime",
            "generator": "images",
            "gimlimit": "50",
        },
        timeout=timeout,
    ).json()

    # A response lacking "query"/"pages", or a page entry without
    # "imageinfo", contributes no URLs instead of raising KeyError.
    pages = response.get("query", {}).get("pages", {})

    image_urls = []
    for page in pages.values():
        image_info = page.get("imageinfo")
        if not image_info:
            continue
        url = image_info[0].get("url", "")
        # str.endswith accepts a tuple, so one call covers both extensions.
        if url.endswith((".jpg", ".png")):
            image_urls.append(url)
    return image_urls
import requests def get_wikipedia_images(title): response = requests.get( "https://en.wikipedia.org/w/api.php", params={ "action": "query", "format": "json", "titles": title, "prop": "imageinfo", "iiprop": "url|dimensions|mime", "generator": "images", "gimlimit": "50", }, ).json() image_urls = [] for page in response["query"]["pages"].values(): if page["imageinfo"][0]["url"].endswith(".jpg") or page["imageinfo"][ 0 ]["url"].endswith(".png"): image_urls.append(page["imageinfo"][0]["url"]) return image_urls

In [ ]

Copied!





from pathlib import Path
import urllib.request

# Running counter used to name downloaded image files (1, 2, 3, ...).
image_uuid = 0
# Upper bound on the number of images saved per Wikipedia page.
MAX_IMAGES_PER_WIKI = 20

wiki_titles = {
    "Tesla Model X",
    "Pablo Picasso",
    "Rivian",
    "The Lord of the Rings",
    "The Matrix",
    "The Simpsons",
}

# Directory that receives both the article text files and the images.
data_path = Path("mixed_wiki")
# exist_ok makes re-running the cell safe without a separate exists() check.
data_path.mkdir(exist_ok=True)

# Iterate in sorted order: a set has no stable iteration order across runs,
# which would make the numbered image file names irreproducible.
for title in sorted(wiki_titles):
    # Fetch the plain-text extract of the article.
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
        },
        timeout=30,
    ).json()
    page = next(iter(response["query"]["pages"].values()))
    wiki_text = page["extract"]

    # Write UTF-8 explicitly; the platform default encoding may be unable to
    # represent non-ASCII characters in the article text.
    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

    images_per_wiki = 0
    try:
        list_img_urls = get_wikipedia_images(title)
        if not list_img_urls:
            print("No images found for Wikipedia page: " + title)

        for url in list_img_urls:
            if not url.endswith((".jpg", ".png")):
                continue
            image_uuid += 1
            # Keep the real extension so PNG files are not saved as .jpg.
            suffix = ".png" if url.endswith(".png") else ".jpg"
            urllib.request.urlretrieve(
                url, data_path / f"{image_uuid}{suffix}"
            )
            images_per_wiki += 1
            # Stop once MAX_IMAGES_PER_WIKI images were saved for this page.
            if images_per_wiki >= MAX_IMAGES_PER_WIKI:
                break
    except Exception as err:
        # Best effort: a failed lookup or download skips the remaining images
        # of this page only. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt working, and the cause is reported.
        print("No images found for Wikipedia page: " + title + f" ({err!r})")
from pathlib import Path import urllib.request image_uuid = 0 MAX_IMAGES_PER_WIKI = 20 wiki_titles = { "Tesla Model X", "Pablo Picasso", "Rivian", "The Lord of the Rings", "The Matrix", "The Simpsons", } data_path = Path("mixed_wiki") if not data_path.exists(): Path.mkdir(data_path) for title in wiki_titles: response = requests.get( "https://en.wikipedia.org/w/api.php", params={ "action": "query", "format": "json", "titles": title, "prop": "extracts", "explaintext": True, }, ).json() page = next(iter(response["query"]["pages"].values())) wiki_text = page["extract"] with open(data_path / f"{title}.txt", "w") as fp: fp.write(wiki_text) images_per_wiki = 0 try: # page_py = wikipedia.page(title) list_img_urls = get_wikipedia_images(title) # print(list_img_urls) for url in list_img_urls: if url.endswith(".jpg") or url.endswith(".png"): image_uuid += 1 # image_file_name = title + "_" + url.split("/")[-1] urllib.request.urlretrieve( url, data_path / f"{image_uuid}.jpg" ) images_per_wiki += 1 # Limit the number of images downloaded per wiki page to 15 if images_per_wiki > MAX_IMAGES_PER_WIKI: break except: print(str(Exception("未找到维基百科页面图片: ")) + title) continue

设置嵌入模型¶

In [ ]

Copied!

from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction

# Set the default text and image embedding function (OpenCLIP).
embedding_function = OpenCLIPEmbeddingFunction()
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction # set defalut text and image embedding functions embedding_function = OpenCLIPEmbeddingFunction()

/Users/haotianzhang/llama_index/venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

In [ ]

Copied!





from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import SimpleDirectoryReader, StorageContext
from chromadb.utils.data_loaders import ImageLoader

# Loader passed to the collection below as its data_loader.
image_loader = ImageLoader()

# Create an ephemeral Chroma client and a new collection that uses the
# embedding function defined earlier together with the image loader.
# NOTE(review): create_collection presumably raises if the collection already
# exists (e.g. when this cell is re-run in the same kernel) — confirm;
# get_or_create_collection may be preferable.
chroma_client = chromadb.EphemeralClient()
chroma_collection = chroma_client.create_collection(
    "multimodal_collection",
    embedding_function=embedding_function,
    data_loader=image_loader,
)


# Load the documents from the ./mixed_wiki/ directory.
documents = SimpleDirectoryReader("./mixed_wiki/").load_data()

# Wrap the collection in a ChromaVectorStore, attach it to a storage context
# and build the index from the loaded documents.
# NOTE(review): MultiModalVectorStoreIndex and QdrantVectorStore are imported
# for this cell but never used; the index built here is a plain
# VectorStoreIndex — confirm this is intended.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)
from llama_index.core.indices import MultiModalVectorStoreIndex from llama_index.vector_stores.qdrant import QdrantVectorStore from llama_index.core import SimpleDirectoryReader, StorageContext from chromadb.utils.data_loaders import ImageLoader image_loader = ImageLoader() # create client and a new collection chroma_client = chromadb.EphemeralClient() chroma_collection = chroma_client.create_collection( "multimodal_collection", embedding_function=embedding_function, data_loader=image_loader, ) # load documents documents = SimpleDirectoryReader("./mixed_wiki/").load_data() # set up ChromaVectorStore and load in data vector_store = ChromaVectorStore(chroma_collection=chroma_collection) storage_context = StorageContext.from_defaults(vector_store=vector_store) index = VectorStoreIndex.from_documents( documents, storage_context=storage_context, )

In [ ]

Copied!

# Build a retriever over the index (similarity_top_k=50) and run a text query
# against it; the results are kept in retrieval_results.
retriever = index.as_retriever(similarity_top_k=50)
retrieval_results = retriever.retrieve("Picasso famous paintings")
retriever = index.as_retriever(similarity_top_k=50) retrieval_results = retriever.retrieve("Picasso famous paintings")

In [ ]

Copied!





# print(retrieval_results)
from llama_index.core.schema import ImageNode
from llama_index.core.response.notebook_utils import (
    display_source_node,
    display_image_uris,
)


# Split the retrieval results: file paths of image nodes are collected for a
# single gallery call, while at most MAX_RES of the other nodes are rendered
# inline (cnt still counts every non-image node).
MAX_RES = 5
image_results = []
cnt = 0
for r in retrieval_results:
    if not isinstance(r.node, ImageNode):
        if cnt < MAX_RES:
            display_source_node(r)
        cnt += 1
        continue
    image_results.append(r.node.metadata["file_path"])

display_image_uris(image_results, [3, 3], top_k=2)
# print(retrieval_results) from llama_index.core.schema import ImageNode from llama_index.core.response.notebook_utils import ( display_source_node, display_image_uris, ) image_results = [] MAX_RES = 5 cnt = 0 for r in retrieval_results: if isinstance(r.node, ImageNode): image_results.append(r.node.metadata["file_path"]) else: if cnt < MAX_RES: display_source_node(r) cnt += 1 display_image_uris(image_results, [3, 3], top_k=2)

节点ID： 13adcbba-fe8b-4d51-9139-fb1c55ffc6be
相似度 0.774399292477267
文本： == Artistic legacy == Picasso's influence was and remains immense and widely acknowledged by his ...

节点ID： 4100593e-6b6a-4b5f-8384-98d1c2468204
相似度 0.7695965506408678
文本： === Later works to final years: 1949–1973 === Picasso was one of 250 sculptors who exhibited in t...

节点ID： aeed9d43-f9c5-42a9-a7b9-1a3c005e3745
相似度 0.7693110304140338
文本： Pablo Ruiz Picasso (25 October 1881 – 8 April 1973) was a Spanish painter, sculptor, printmaker, ...

节点ID： 5a6613b6-b599-4e40-92f2-231e10ed54f6
相似度 0.7656537748231977
文本： === The Basel vote === In the 1940s, a Swiss insurance company based in Basel had bought two pain...

节点ID： cc17454c-030d-4f86-a12e-342d0582f4d3
相似度 0.7639671751819532
文本： == Style and technique ==

Picasso was exceptionally prolific throughout his long lifetime. At hi...

No description has been provided for this image

使用LlamaIndex的Chroma多模态演示¶

基本示例¶

创建 Chroma 索引¶

从维基百科下载图片和文本¶

设置嵌入模型¶

使用LlamaIndex构建Chroma多模态索引¶

从多模态索引检索结果¶