基本示例¶
在这个基本示例中,我们获取 Paul Graham 的一篇文章,将其分割成块,使用开源嵌入模型对其进行嵌入,将其加载到 Bagel 中,然后对其进行查询。
输入 [ ]
已复制!
%pip install llama-index-vector-stores-bagel
%pip install llama-index-embeddings-huggingface
%pip install bagelML
%pip install llama-index-vector-stores-bagel %pip install llama-index-embeddings-huggingface %pip install bagelML
输入 [ ]
已复制!
# import
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.bagel import BagelVectorStore
from llama_index.core import StorageContext
from IPython.display import Markdown, display
import bagel
from bagel import Settings
# 从 llama_index.core 导入 VectorStoreIndex, SimpleDirectoryReader 从 llama_index.vector_stores.bagel 导入 BagelVectorStore 从 llama_index.core 导入 StorageContext 从 IPython.display 导入 Markdown, display 导入 bagel 从 bagel 导入 Settings
输入 [ ]
已复制!
# set up OpenAI
# Read the OpenAI API key interactively (getpass hides the input) so the key
# is never hard-coded in the notebook, then expose it both via the process
# environment and the openai module-level attribute.
import os
import getpass
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
import openai
openai.api_key = os.environ["OPENAI_API_KEY"]
# 设置 OpenAI 导入 os 导入 getpass os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:") 导入 openai openai.api_key = os.environ["OPENAI_API_KEY"]
下载数据
输入 [ ]
已复制!
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
!mkdir -p 'data/paul_graham/' !wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
输入 [ ]
已复制!
# create server settings
# Point the Bagel client at the hosted REST endpoint.
server_settings = Settings(
bagel_api_impl="rest", bagel_server_host="api.bageldb.ai"
)
# create client
client = bagel.Client(server_settings)
# create collection
# dimension=384 — presumably chosen to match the bge-small-en-v1.5 embedding
# size used below; confirm against the embedding model's output dimension.
collection = client.get_or_create_cluster(
"testing_embeddings", embedding_model="custom", dimension=384
)
# define embedding function
# The "local:" prefix tells LlamaIndex to run the HuggingFace model locally.
embed_model = "local:BAAI/bge-small-en-v1.5"
# load documents
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
# set up BagelVectorStore and load in data
vector_store = BagelVectorStore(collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
documents, storage_context=storage_context, embed_model=embed_model
)
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
# NOTE(review): the <b> tags suggest this was display(Markdown(f"<b>...</b>"))
# in the source notebook; a plain print shows the tags literally.
print(f"<b>{response}</b>")
# 创建服务器设置 server_settings = Settings( bagel_api_impl="rest", bagel_server_host="api.bageldb.ai" ) # 创建客户端 client = bagel.Client(server_settings) # 创建集合 collection = client.get_or_create_cluster( "testing_embeddings", embedding_model="custom", dimension=384 ) # 定义嵌入函数 embed_model = "local:BAAI/bge-small-en-v1.5" # 加载文档 documents = SimpleDirectoryReader("./data/paul_graham/").load_data() # 设置 BagelVectorStore 并加载数据 vector_store = BagelVectorStore(collection=collection) storage_context = StorageContext.from_defaults(vector_store=vector_store) index = VectorStoreIndex.from_documents( documents, storage_context=storage_context, embed_model=embed_model ) query_engine = index.as_query_engine() response = query_engine.query("What did the author do growing up?") print(f"{response}")
创建 - 添加 - 获取¶
输入 [ ]
已复制!
def create_add_get(client):
    """Create a cluster, add two documents, and read back the first item.

    Parameters
    ----------
    client : Bagel client instance used to create or fetch clusters.
    """
    cluster_name = "testing"
    # Fetch the cluster, creating it on first use.
    cluster = client.get_or_create_cluster(cluster_name)
    # Insert two sample documents, each with a metadata source and a random id.
    response = cluster.add(
        documents=[
            "This is document1",
            "This is bidhan",
        ],
        metadatas=[{"source": "google"}, {"source": "notion"}],
        ids=[str(uuid.uuid4()), str(uuid.uuid4())],
    )
    # Report how many documents the cluster now holds.
    print("count of docs:", cluster.count())
    # Peek at the first stored item to confirm the write landed.
    if cluster.peek(1):
        print("get 1st item")
    print(">> create_add_get done !\n")
def create_add_get(client): """ 创建、添加和获取 """ name = "testing" # 获取或创建集群 cluster = client.get_or_create_cluster(name) # 向集群添加文档 resp = cluster.add( documents=[ "This is document1", "This is bidhan", ], metadatas=[{"source": "google"}, {"source": "notion"}], ids=[str(uuid.uuid4()), str(uuid.uuid4())], ) # 打印数量 print("count of docs:", cluster.count()) # 获取第一个条目 first_item = cluster.peek(1) if first_item: print("get 1st item") print(">> create_add_get done !\n")
创建 - 添加 - 按文本查找¶
输入 [ ]
已复制!
def create_add_find(client):
    """Create a cluster, add three documents, and run a filtered text query.

    Parameters
    ----------
    client : Bagel client instance used to create or fetch clusters.
    """
    cluster_name = "testing"
    # Fetch (or lazily create) the working cluster.
    cluster = client.get_or_create_cluster(cluster_name)
    # Store three documents with per-document source metadata and random ids.
    cluster.add(
        documents=[
            "This is document",
            "This is Towhid",
            "This is text",
        ],
        metadatas=[
            {"source": "notion"},
            {"source": "notion"},
            {"source": "google-doc"},
        ],
        ids=[str(uuid.uuid4()) for _ in range(3)],
    )
    # Similarity search restricted to notion-sourced docs containing "is".
    matches = cluster.find(
        query_texts=["This"],
        n_results=5,
        where={"source": "notion"},
        where_document={"$contains": "is"},
    )
    print(matches)
    print(">> create_add_find done !\n")
def create_add_find(client): """ 创建、添加和查找 参数 ---------- api : _type_ _description_ """ name = "testing" # 获取或创建集群 cluster = client.get_or_create_cluster(name) # 向集群添加文档 cluster.add( documents=[ "This is document", "This is Towhid", "This is text", ], metadatas=[ {"source": "notion"}, {"source": "notion"}, {"source": "google-doc"}, ], ids=[str(uuid.uuid4()), str(uuid.uuid4()), str(uuid.uuid4())], ) # 查询集群以获取相似结果 results = cluster.find( query_texts=["This"], n_results=5, where={"source": "notion"}, where_document={"$contains": "is"}, ) print(results) print(">> create_add_find done !\n")
创建 - 添加 - 按嵌入查找¶
输入 [ ]
已复制!
def create_add_find_em(client):
    """Create a cluster, add precomputed embeddings, and query by embedding.

    Parameters
    ----------
    client : Bagel client instance; must support ``reset`` and
        ``get_or_create_cluster``.
    """
    name = "testing_embeddings"
    # Reset the Bagel server so the run starts from a clean state.
    client.reset()
    # Get or create a cluster.
    # Bug fix: the original called the undefined name ``api`` here instead of
    # the ``client`` parameter, which raised NameError at runtime.
    cluster = client.get_or_create_cluster(name)
    # Add embeddings with aligned metadata, document, and id rows (8 of each).
    cluster.add(
        embeddings=[
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
            [1.1, 2.3, 3.2],
            [4.5, 6.9, 4.4],
        ],
        metadatas=[
            {"uri": "img1.png", "style": "style1"},
            {"uri": "img2.png", "style": "style2"},
            {"uri": "img3.png", "style": "style1"},
            {"uri": "img4.png", "style": "style1"},
            {"uri": "img5.png", "style": "style1"},
            {"uri": "img6.png", "style": "style1"},
            {"uri": "img7.png", "style": "style1"},
            {"uri": "img8.png", "style": "style1"},
        ],
        documents=[
            "doc1",
            "doc2",
            "doc3",
            "doc4",
            "doc5",
            "doc6",
            "doc7",
            "doc8",
        ],
        ids=["id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8"],
    )
    # Query the cluster for the 5 nearest neighbours of one embedding.
    results = cluster.find(query_embeddings=[[1.1, 2.3, 3.2]], n_results=5)
    print("find result:", results)
    print(">> create_add_find_em done !\n")
def create_add_find_em(client): """创建、添加和查找嵌入 参数 ---------- api : _type_ _description_ """ name = "testing_embeddings" # 重置 Bagel 服务器 client.reset() # 获取或创建集群 cluster = client.get_or_create_cluster(name) # 向集群添加嵌入和其他数据 cluster.add( embeddings=[ [1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], [4.5, 6.9, 4.4], ], metadatas=[ {"uri": "img1.png", "style": "style1"}, {"uri": "img2.png", "style": "style2"}, {"uri": "img3.png", "style": "style1"}, {"uri": "img4.png", "style": "style1"}, {"uri": "img5.png", "style": "style1"}, {"uri": "img6.png", "style": "style1"}, {"uri": "img7.png", "style": "style1"}, {"uri": "img8.png", "style": "style1"}, ], documents=[ "doc1", "doc2", "doc3", "doc4", "doc5", "doc6", "doc7", "doc8", ], ids=["id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8"], ) # 查询集群以获取结果 results = cluster.find(query_embeddings=[[1.1, 2.3, 3.2]], n_results=5) print("find result:", results) print(">> create_add_find_em done !\n")
创建 - 添加 - 修改 - 更新¶
输入 [ ]
已复制!
def create_add_modify_update(client):
    """Create a cluster, rename it, add documents, and update metadata.

    Parameters
    ----------
    client : Bagel client instance used to create or fetch clusters.
    """
    old_name = "testing"
    renamed = "new_" + old_name
    # Fetch (or create) the cluster under its original name.
    cluster = client.get_or_create_cluster(old_name)
    # Rename the cluster, showing the name before and after the change.
    print("Before:", cluster.name)
    cluster.modify(name=renamed)
    print("After:", cluster.name)
    # Store two documents with fixed ids so "id1" can be updated below.
    cluster.add(
        documents=[
            "This is document1",
            "This is bidhan",
        ],
        metadatas=[{"source": "notion"}, {"source": "google"}],
        ids=["id1", "id2"],
    )
    # Show id1's metadata, rewrite its source, then show it again.
    print("Before update:")
    print(cluster.get(ids=["id1"]))
    cluster.update(ids=["id1"], metadatas=[{"source": "google"}])
    print("After update source:")
    print(cluster.get(ids=["id1"]))
    print(">> create_add_modify_update done !\n")
def create_add_modify_update(client): """ 创建、添加、修改和更新 参数 ---------- api : _type_ _description_ """ name = "testing" new_name = "new_" + name # 获取或创建集群 cluster = client.get_or_create_cluster(name) # 修改集群名称 print("之前:", cluster.name) cluster.modify(name=new_name) print("之后:", cluster.name) # 向集群添加文档 cluster.add( documents=[ "This is document1", "This is bidhan", ], metadatas=[{"source": "notion"}, {"source": "google"}], ids=["id1", "id2"], ) # 更新前检索文档元数据 print("更新之前:") print(cluster.get(ids=["id1"])) # 更新文档元数据 cluster.update(ids=["id1"], metadatas=[{"source": "google"}]) # 更新 source 后检索文档元数据 print("更新 source 之后:") print(cluster.get(ids=["id1"])) print(">> create_add_modify_update done !\n")
创建 - 插入或更新¶
输入 [ ]
已复制!
def create_upsert(client):
    """Create a cluster, add documents, then upsert (update id1, insert id3).

    Parameters
    ----------
    client : Bagel client instance; must support ``reset`` and
        ``get_or_create_cluster``.
    """
    # Reset the Bagel server so the final count is predictable.
    # Bug fix: the original called the undefined name ``api`` here instead of
    # the ``client`` parameter, which raised NameError at runtime.
    client.reset()
    name = "testing"
    # Get or create a cluster.
    cluster = client.get_or_create_cluster(name)
    # Add two documents under fixed ids.
    cluster.add(
        documents=[
            "This is document1",
            "This is bidhan",
        ],
        metadatas=[{"source": "notion"}, {"source": "google"}],
        ids=["id1", "id2"],
    )
    # Upsert: overwrites existing "id1" and inserts new "id3".
    cluster.upsert(
        documents=[
            "This is document",
            "This is google",
        ],
        metadatas=[{"source": "notion"}, {"source": "google"}],
        ids=["id1", "id3"],
    )
    # Print the count of documents in the cluster (3 after the upsert).
    print("Count of documents:", cluster.count())
    print(">> create_upsert done !\n")
def create_upsert(client): """ 创建和插入或更新 参数 ---------- api : _type_ _description_ """ # 重置 Bagel 服务器 client.reset() name = "testing" # 获取或创建集群 cluster = client.get_or_create_cluster(name) # 向集群添加文档 cluster.add( documents=[ "This is document1", "This is bidhan", ], metadatas=[{"source": "notion"}, {"source": "google"}], ids=["id1", "id2"], ) # 在集群中插入或更新文档 cluster.upsert( documents=[ "This is document", "This is google", ], metadatas=[{"source": "notion"}, {"source": "google"}], ids=["id1", "id3"], ) # 打印集群中的文档数量 print("文档数量:", cluster.count()) print(">> create_upsert done !\n")