WordLift Vector Store
Introduction
This script demonstrates how to crawl a product website, extract relevant information, build an SEO-friendly knowledge graph (a structured representation of the PDPs and PLPs), and leverage it to improve search and user experience.
Key features and libraries:
- Web scraping (Advertools)
- Knowledge graph creation for Product Detail Pages (PDPs) and Product Listing Pages (PLPs) (WordLift)
- Product recommendations (WordLift Neural Search)
- Shopping assistant creation (WordLift + LlamaIndex 🦙)
This approach enhances SEO performance and user engagement for e-commerce sites.
Learn more about how it works here:
- https://www.youtube.com/watch?v=CH-ir1MTAwQ
- https://wordlift.io/academy-entries/mastering-serp-analysis-knowledge-graphs
Authors: Andrea Volpini and David Riccitelli. MIT License. Last updated: July 31, 2024.
Setup
!pip install advertools -q
!pip install -U wordlift-client # 🎉 first time on stage 🎉
!pip install rdflib -q
# Standard library imports
import json
import logging
import os
import re
import urllib.parse
import requests
from typing import List, Optional
# Third-party imports
import advertools as adv
import pandas as pd
import nest_asyncio
# RDFLib imports
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import SDO, Namespace, DefinedNamespace
# WordLift client imports
import wordlift_client
from wordlift_client import Configuration, ApiClient
from wordlift_client.rest import ApiException
from wordlift_client.api.dataset_api import DatasetApi
from wordlift_client.api.entities_api import EntitiesApi
from wordlift_client.api.graph_ql_api import GraphQLApi
from wordlift_client.models.graphql_request import GraphqlRequest
from wordlift_client.models.page_vector_search_query_response_item import (
PageVectorSearchQueryResponseItem,
)
from wordlift_client.models.vector_search_query_request import (
VectorSearchQueryRequest,
)
from wordlift_client.api.vector_search_queries_api import (
VectorSearchQueriesApi,
)
# Asynchronous programming
import asyncio
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Apply nest_asyncio
nest_asyncio.apply()
WORDLIFT_KEY = os.getenv("WORDLIFT_KEY")
OPENAI_KEY = os.getenv("OPENAI_KEY")
Crawling the Website with Advertools
# Step 1: Define the website structure
# -----------------------------------
# We're working with two types of pages:
# 1. Product Listing Pages (PLP): https://product-finder.wordlift.io/product-category/bags/
# 2. Product Detail Pages (PDP): https://product-finder.wordlift.io/product/1980s-marco-polo-crossbody-bag-in-black/
# The product description can be found at this XPath:
# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()
# The price is here:
# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()
# The category is here:
# //span[contains(@class, 'breadcrumb')]/a/text()
# Step 2: Set up the crawl
# ------------------------
def crawl_website(url, output_file, num_pages=10):
logger.info(f"Starting crawl of {url}")
adv.crawl(
url,
output_file,
follow_links=True,
custom_settings={
"CLOSESPIDER_PAGECOUNT": num_pages,
"USER_AGENT": "WordLiftBot/1.0 (Maven Project)",
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
"DOWNLOAD_DELAY": 1,
"ROBOTSTXT_OBEY": False,
},
xpath_selectors={
"product_description": "/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()",
"product_price": "/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()",
"product_category": "//span[@class='posted_in']/a/text()",
},
)
logger.info(f"Crawl completed. Results saved to {output_file}")
# Step 3: Analyze URL patterns
# ----------------------------
def analyze_url_patterns(df):
df["page_type"] = df["url"].apply(
lambda x: "PLP"
if "/product-category/" in x
else ("PDP" if "/product/" in x else "Other")
)
logger.info(
f"Found {(df['page_type'] == 'PLP').sum()} PLPs and {(df['page_type'] == 'PDP').sum()} PDPs"
)
return df
# Step 4: Extract page data
# ----------------------------
def extract_page_data(df):
extracted_data = []
for _, row in df.iterrows():
page = {
"url": row["url"],
"title": row["title"],
"page_type": row["page_type"],
"meta_description": row.get("meta_description", ""),
"og_title": row.get("og_title", ""),
"og_description": row.get("og_description", ""),
"h1": ", ".join(row.get("h1", []))
if isinstance(row.get("h1"), list)
else row.get("h1", ""),
"h2": ", ".join(row.get("h2", []))
if isinstance(row.get("h2"), list)
else row.get("h2", ""),
}
if row["page_type"] == "PDP":
page.update(
{
"product_description": ", ".join(
row.get("product_description", [])
)
if isinstance(row.get("product_description"), list)
else row.get("product_description", ""),
"product_price": ", ".join(row.get("product_price", []))
if isinstance(row.get("product_price"), list)
else row.get("product_price", ""),
"product_category": ", ".join(
row.get("product_category", [])
)
if isinstance(row.get("product_category"), list)
else row.get("product_category", ""),
}
)
elif row["page_type"] == "PLP":
# Parse the category from the H1 content
h1_content = (
row.get("h1", [""])[0]
if isinstance(row.get("h1"), list)
else row.get("h1", "")
)
category = (
h1_content.split("@@")[-1]
if "@@" in h1_content
else h1_content.replace("Category: ", "").strip()
)
page["category_name"] = category
extracted_data.append(page)
return pd.DataFrame(extracted_data)
Building the Knowledge Graph with WordLift 🕸
# Step 5: Configure the WordLift client
# ----------------------------
# Create a configuration object for the WordLift API client using your WordLift key.
configuration = Configuration(host="https://api.wordlift.io")
configuration.api_key["ApiKey"] = WORDLIFT_KEY
configuration.api_key_prefix["ApiKey"] = "Key"
EXAMPLE_PRIVATE_NS = Namespace("https://ns.example.org/private/")
BASE_URI = "http://data.wordlift.io/[dataset_id]/"
# Step 6: Build the KG and the embeddings
# ----------------------------
async def cleanup_knowledge_graph(api_client):
dataset_api = wordlift_client.DatasetApi(api_client)
try:
# Delete all
await dataset_api.delete_all_entities()
except Exception as e:
print(
"Exception when calling DatasetApi->delete_all_entities: %s\n" % e
)
async def create_entity(entities_api, entity_data):
g = Graph().parse(data=json.dumps(entity_data), format="json-ld")
body = g.serialize(format="application/rdf+xml")
await entities_api.create_or_update_entities(
body=body, _content_type="application/rdf+xml"
)
def replace_url(original_url: str) -> str:
old_domain = "https://product-finder.wordlift.io/"
new_domain = "https://data-science-with-python-for-seo.wordlift.dev/"
if original_url.startswith(old_domain):
return original_url.replace(old_domain, new_domain, 1)
else:
return original_url
def create_entity_uri(url):
parsed_url = urllib.parse.urlparse(url)
path = parsed_url.path.strip("/")
path_parts = path.split("/")
fragment = parsed_url.fragment
if "product" in path_parts:
# It's a product page or product offer
product_id = path_parts[-1] # Get the last part of the path
if fragment == "offer":
return f"{BASE_URI}offer_{product_id}"
else:
return f"{BASE_URI}product_{product_id}"
elif "product-category" in path_parts:
# It's a product listing page (PLP)
category = path_parts[-1] # Get the last part of the path
return f"{BASE_URI}plp_{category}"
else:
# For any other type of page
safe_path = "".join(c if c.isalnum() else "_" for c in path)
if fragment == "offer":
return f"{BASE_URI}offer_{safe_path}"
else:
return f"{BASE_URI}page_{safe_path}"
def clean_price(price_str):
if not price_str or price_str == "N/A":
return None
if isinstance(price_str, (int, float)):
return float(price_str)
try:
# Remove any non-numeric characters except for the decimal point
cleaned_price = "".join(
char for char in str(price_str) if char.isdigit() or char == "."
)
return float(cleaned_price)
except ValueError:
logger.warning(f"Could not convert price: {price_str}")
return None
def create_product_entity(row, dataset_uri):
url = replace_url(row["url"])
product_entity_uri = create_entity_uri(url)
entity_data = {
"@context": "http://schema.org",
"@type": "Product",
"@id": product_entity_uri,
"url": url,
"name": row["title"]
if not pd.isna(row["title"])
else "Untitled Product",
"urn:meta:requestEmbeddings": [
"http://schema.org/name",
"http://schema.org/description",
],
}
if not pd.isna(row.get("product_description")):
entity_data["description"] = row["product_description"]
if not pd.isna(row.get("product_price")):
price = clean_price(row["product_price"])
if price is not None:
# Create offer ID as a sub-resource of the product ID
offer_entity_uri = f"{product_entity_uri}/offer_1"
entity_data["offers"] = {
"@type": "Offer",
"@id": offer_entity_uri,
"price": str(price),
"priceCurrency": "GBP",
"availability": "http://schema.org/InStock",
"url": url,
}
if not pd.isna(row.get("product_category")):
entity_data["category"] = row["product_category"]
custom_attributes = {
key: row[key]
for key in [
"meta_description",
"og_title",
"og_description",
"h1",
"h2",
]
if not pd.isna(row.get(key))
}
if custom_attributes:
entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(
custom_attributes
)
return entity_data
def create_collection_entity(row, dataset_uri):
url = replace_url(row["url"])
entity_uri = create_entity_uri(url)
entity_data = {
"@context": "http://schema.org",
"@type": "CollectionPage",
"@id": entity_uri,
"url": url,
"name": row["category_name"] or row["title"],
}
custom_attributes = {
key: row[key]
for key in [
"meta_description",
"og_title",
"og_description",
"h1",
"h2",
]
if row.get(key)
}
if custom_attributes:
entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(
custom_attributes
)
return entity_data
async def build_knowledge_graph(df, dataset_uri, api_client):
entities_api = EntitiesApi(api_client)
for _, row in df.iterrows():
try:
if row["page_type"] == "PDP":
entity_data = create_product_entity(row, dataset_uri)
elif row["page_type"] == "PLP":
entity_data = create_collection_entity(row, dataset_uri)
else:
logger.warning(
f"Skipping unknown page type for URL: {row['url']}"
)
continue
if entity_data is None:
logger.warning(
f"Skipping page due to missing critical data: {row['url']}"
)
continue
await create_entity(entities_api, entity_data)
logger.info(
f"Created entity for {row['page_type']}: {row['title']}"
)
except Exception as e:
logger.error(
f"Error creating entity for {row['page_type']}: {row['title']}"
)
logger.error(f"Error: {str(e)}")
Running the Demo
# ----------------------------
# Main Execution
# ----------------------------
# Global configuration variables
CRAWL_URL = "https://product-finder.wordlift.io/"
OUTPUT_FILE = "crawl_results.jl"
async def main():
# Step 1: Crawl the website
crawl_website(CRAWL_URL, OUTPUT_FILE)
# Step 2: Load the crawled data
df = pd.read_json(OUTPUT_FILE, lines=True)
# Step 3: Analyze URL patterns
df = analyze_url_patterns(df)
# Step 4: Extract page data
pages_df = extract_page_data(df)
async with ApiClient(configuration) as api_client:
# Clean up the existing knowledge graph
try:
await cleanup_knowledge_graph(api_client)
logger.info(f"Knowledge Graph Cleaned Up")
except Exception as e:
logger.error(
f"Failed to clean up the existing Knowledge Graph: {str(e)}"
)
return # Exit if cleanup fails
# Build the new knowledge graph
await build_knowledge_graph(pages_df, CRAWL_URL, api_client)
logger.info("Knowledge graph building completed.")
if __name__ == "__main__":
asyncio.run(main())
Querying the Products in the Knowledge Graph with GraphQL
async def perform_graphql_query(api_client):
graphql_api = GraphQLApi(api_client)
query = """
{
products(rows: 20) {
id: iri
category: string(name:"schema:category")
name: string(name:"schema:name")
description: string(name:"schema:description")
url: string(name:"schema:url")
}
}
"""
request = GraphqlRequest(query=query)
try:
response = await graphql_api.graphql_using_post(body=request)
print("GraphQL Query Results:")
print(json.dumps(response, indent=2))
except Exception as e:
logger.error(f"An error occurred during GraphQL query: {e}")
async with ApiClient(configuration) as api_client:
# Step 6: Perform GraphQL query
await perform_graphql_query(api_client)
logger.info("Knowledge graph building and GraphQL query completed.")
Leveraging the Knowledge Graph
Now that we have successfully created a knowledge graph for our e-commerce website, complete with product embeddings, we can leverage it to enhance the user experience and site functionality. The embeddings we generated for each product allow us to perform semantic similarity searches and build smarter systems.
Adding Structured Data to Web Pages
In this section we run a quick test of WordLift's Data API. This API is used to inject structured data markup from the knowledge graph (KG) into your web pages. Structured data helps search engines better understand your content, potentially earning rich results in search and improving SEO.
For this notebook we use a pre-configured KG on a demo e-commerce website. We'll reference a fictional URL: https://data-science-with-python-for-seo.wordlift.dev.
When calling WordLift's Data API, we simply pass a URL and receive the corresponding JSON-LD (JavaScript Object Notation for Linked Data). For an e-commerce site, this structured data typically includes information such as product details, prices, and availability.
The get_json_ld_from_url() function below demonstrates this process. It takes a URL as input and returns the structured data in JSON-LD format, ready to be injected into your web page.
def get_json_ld_from_url(url):
# Construct the API URL by prefixing with 'https://api.wordlift.io/data/https/'
api_url = "https://api.wordlift.io/data/https/" + url.replace(
"https://", ""
)
# Make the GET request to the API
response = requests.get(api_url)
# Check if the request was successful
if response.status_code == 200:
# Parse the JSON-LD from the response
json_ld = response.json()
return json_ld
else:
print(f"Failed to retrieve data: {response.status_code}")
return None
def pretty_print_json(json_obj):
# Pretty print the JSON object
print(json.dumps(json_obj, indent=4))
# Let's run a test
url = "https://data-science-with-python-for-seo.wordlift.dev/product/100-pure-deluxe-travel-pack-duo-2/"
json_ld = get_json_ld_from_url(url)
json_ld
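The injection step itself happens outside the Data API. As a minimal sketch (the helper below is hypothetical and not part of the WordLift client), the returned JSON-LD can be wrapped in a script tag and placed in the page's <head>:
# Hypothetical helper: wrap the JSON-LD returned by the Data API in a
# <script type="application/ld+json"> tag ready to embed in a page's <head>.
def to_json_ld_script_tag(json_ld_obj):
    if json_ld_obj is None:
        return ""
    return (
        '<script type="application/ld+json">'
        + json.dumps(json_ld_obj)
        + "</script>"
    )
# Preview the first part of the generated markup
print(to_json_ld_script_tag(json_ld)[:300])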
Generating Similar Product Links with WordLift Neural Search
With our product embeddings in place, we can now leverage WordLift's Neural Search to recommend similar products to users. This feature significantly enhances user engagement and can increase sales by surfacing relevant products based on semantic similarity.
Unlike traditional keyword matching, semantic similarity takes into account the context and meaning of product descriptions. This allows for more nuanced and accurate recommendations, even when products don't share exact keywords.
The get_top_k_similar_urls function below implements this functionality. It takes a product URL as input and returns a list of semantically similar products, ranked by similarity score.
For example, if a user is viewing a red cotton t-shirt, the function might recommend cotton t-shirts in other colors, or similarly styled tops in different fabrics. This creates a more intuitive and engaging shopping experience.
By implementing this neural search capability, we create a more personalized and efficient shopping experience, potentially increasing user satisfaction and conversion rates.
async def get_top_k_similar_urls(configuration, query_url: str, top_k: int):
request = VectorSearchQueryRequest(
query_url=query_url,
similarity_top_k=top_k,
)
async with wordlift_client.ApiClient(configuration) as api_client:
api_instance = VectorSearchQueriesApi(api_client)
try:
page = await api_instance.create_query(
vector_search_query_request=request
)
return [
{
"url": item.id,
"name": item.text.split("\n")[0],
"score": item.score,
}
for item in page.items
if item.id and item.text
]
except Exception as e:
logger.error(f"Error querying for entities: {e}", exc_info=True)
return None
top_k = 10
url = "https://data-science-with-python-for-seo.wordlift.dev/product/100-mineral-sunscreen-spf-30/"
similar_urls = await get_top_k_similar_urls(
configuration, query_url=url, top_k=top_k
)
print(json.dumps(similar_urls, indent=2))
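To turn these results into "similar product" links for a product page template, a small formatting helper is enough. The sketch below is illustrative rather than part of WordLift's API; note that the url field holds the entity identifier returned by the neural search.
# Hypothetical helper: render the neural search results as an HTML list of links.
def render_similar_product_links(similar_products):
    if not similar_products:
        return ""
    items = [
        f'<li><a href="{item["url"]}">{item["name"]}</a></li>'
        for item in similar_products
    ]
    return '<ul class="similar-products">\n' + "\n".join(items) + "\n</ul>"
print(render_similar_product_links(similar_urls))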
Building a Chatbot for the E-commerce Website with LlamaIndex 🦙
The knowledge graph we've created is the perfect foundation for building an intelligent chatbot. LlamaIndex (formerly GPT Index) is a powerful data framework that lets us ingest, structure, and access private or domain-specific data with large language models (LLMs). With LlamaIndex, we can create a context-aware chatbot that understands our product catalog and can assist customers effectively.
By combining LlamaIndex with our knowledge graph, we can develop a chatbot that responds to direct queries. The chatbot will be aware of the product catalog, enabling it to:
- Answer questions about product specifications, availability, and pricing
- Make personalized product recommendations based on customer preferences
- Provide comparisons between similar products
This approach enables more natural and helpful interactions with customers, enhancing their shopping experience. The chatbot leverages the structured data in our knowledge graph, using LlamaIndex to efficiently retrieve and present relevant information through the LLM.
In the following sections, we'll walk through setting up LlamaIndex with our knowledge graph data and creating a chatbot that can intelligently assist e-commerce customers.
Install LlamaIndex and WordliftVectorStore 💪
%%capture
!pip install llama-index
!pip install -U 'git+https://github.com/wordlift/llama_index.git#egg=llama-index-vector-stores-wordlift&subdirectory=llama-index-integrations/vector_stores/llama-index-vector-stores-wordlift'
!pip install llama-index-embeddings-nomic
# import the necessary modules
from llama_index.vector_stores.wordlift import WordliftVectorStore
from llama_index.core import VectorStoreIndex
Setting Up NomicEmbeddings for the Query Engine
Nomic has released v1.5 🪆🪆🪆 of its embedding model, which brings significant improvements to text embeddings. Embeddings are numerical representations of text that capture semantic meaning, allowing our system to understand and compare the content of queries and documents.
Key features of Nomic v1.5 include:
- Variable-sized embeddings with dimensions between 64 and 768
- Matryoshka learning, which enables nested representations
- A context size of up to 8192 tokens
We use NomicEmbeddings in WordLift because of these advanced features, and we now also configure it in LlamaIndex for encoding user queries. Keeping the embedding model consistent across the stack ensures better alignment between our knowledge graph and the query-understanding process.
More information about NomicEmbeddings can be found here.
Head over here to get your free API key.
from llama_index.embeddings.nomic import NomicEmbedding
nomic_api_key = os.getenv("NOMIC_KEY")
embed_model = NomicEmbedding(
api_key=nomic_api_key,
dimensionality=128,
model_name="nomic-embed-text-v1.5",
)
embedding = embed_model.get_text_embedding("Hey Ho SEO!")
len(embedding)
We will use OpenAI as the default LLM for generating responses. Of course, any other available LLM could be used instead.
# Set the environment variable
os.environ["OPENAI_API_KEY"] = OPENAI_KEY
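If you prefer a different model, LlamaIndex lets you set the default LLM globally via its Settings object. A minimal sketch (the model name here is just an assumption; any supported LLM integration can be plugged in the same way):
# Optional: set a specific OpenAI model (or any other supported LLM)
# as the default used by the query engines created below.
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.1)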
Now set up WordliftVectorStore using the data from our knowledge graph.
# Let's configure WordliftVectorStore using our WL Key
vector_store = WordliftVectorStore(key=WORDLIFT_KEY)
# Create an index from the vector store
index = VectorStoreIndex.from_vector_store(
vector_store, embed_model=embed_model
)
# Create a query engine
query_engine = index.as_query_engine()
query1 = "Can you give me a product similar to the facial puff? Please add the URL also"
result1 = query_engine.query(query1)
print(result1)
# Function to handle queries
def run_query(query):
# Create an index from the vector store
index = VectorStoreIndex.from_vector_store(
vector_store, embed_model=embed_model
)
# Create a query engine
query_engine = index.as_query_engine()
response = query_engine.query(query)
return response
# Interactive query loop
while True:
user_query = input("Enter your query (or 'quit' to exit): ")
if user_query.lower() == "quit":
break
    result = run_query(user_query)
print(result)
print("\n---\n")