# vector.py -- from the TunNetCom/Pgvector-example-emna repository (main branch) on GitHub
from langchain_community.vectorstores.pgvector import PGVector
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
import pandas as pd
import requests
import os

# Custom remote embedding using Ollama
class RemoteOllamaEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by an Ollama HTTP server.

    Every text is sent in its own ``POST {url}/api/embeddings`` request and
    the ``"embedding"`` field of the JSON reply is used as its vector.
    """

    def __init__(self, model="mxbai-embed-large", url="http://localhost:11434", timeout=60):
        """Store the connection settings; no request is made here.

        Args:
            model: Model name sent in the ``"model"`` field of each request.
            url: Base URL of the Ollama server. A trailing slash is tolerated.
            timeout: Seconds to wait on each HTTP request before ``requests``
                raises a timeout error. Pass ``None`` to wait indefinitely,
                which is what happened before this parameter existed.
        """
        self.model = model
        self.url = url
        self.timeout = timeout

    def embed_documents(self, texts):
        """Return one embedding per entry of ``texts``, in the same order.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            A list holding the ``"embedding"`` value of each server reply.

        Raises:
            Exception: If the server answers with a status other than 200;
                the message carries the response body.
            requests.exceptions.RequestException: On connection errors or
                when ``timeout`` elapses.
        """
        # Normalise here rather than in __init__ so self.url keeps exactly
        # the value the caller supplied; avoids ".../" + "/api/embeddings".
        endpoint = f"{self.url.rstrip('/')}/api/embeddings"

        embeddings = []
        for text in texts:
            res = requests.post(
                endpoint,
                json={"model": self.model, "prompt": text},
                # Without a timeout a stalled server blocks the caller forever.
                timeout=self.timeout,
            )
            if res.status_code != 200:
                raise Exception(f"Embedding failed: {res.text}")
            embeddings.append(res.json()["embedding"])
        return embeddings

    def embed_query(self, text):
        """Return the embedding of a single query string."""
        return self.embed_documents([text])[0]

# Load data: the review CSV is read relative to the current working directory.
df = pd.read_csv("realistic_restaurant_reviews.csv")

# Instantiate embedding function (default model and server URL).
embedding_function = RemoteOllamaEmbeddings()

# Postgres connection config.
# The literal below holds local-development credentials and is only the
# fallback: set PGVECTOR_CONNECTION_STRING in the environment to target
# another database without editing this file.
CONNECTION_STRING = os.environ.get(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql+psycopg2://postgres:postgres@localhost:5432/mydb",
)

# Unique collection name
COLLECTION_NAME = "restaurant_reviews_pg"

# Check if you need to populate the DB.
# When True the CSV rows are embedded and inserted on every run; when False
# the existing collection is opened instead.
add_documents = True  # or make it smarter using a table check

if add_documents:
    # One Document per CSV row: title and review text are joined into the
    # embedded content, rating and date travel along as metadata, and the
    # DataFrame index label (as a string) is the document id.
    documents = [
        Document(
            page_content=row["Title"] + " " + row["Review"],
            metadata={"rating": row["Rating"], "date": row["Date"]},
            id=str(i),
        )
        for i, row in df.iterrows()
    ]
    # Same labels, same order as the documents above.
    ids = [str(i) for i in df.index]

    # Save documents to PGVector.
    # The ids were previously collected but never handed to the store; pass
    # them so the stored rows carry the stable per-row ids.
    # NOTE(review): each run with add_documents=True embeds and inserts the
    # whole CSV again -- confirm how the store treats repeated ids before
    # re-running against an already populated collection.
    vectorstore = PGVector.from_documents(
        embedding=embedding_function,
        documents=documents,
        collection_name=COLLECTION_NAME,
        connection_string=CONNECTION_STRING,
        ids=ids,
    )

else:
    # Load existing vectorstore
    vectorstore = PGVector(
        embedding_function=embedding_function,
        collection_name=COLLECTION_NAME,
        connection_string=CONNECTION_STRING,
    )

# Retriever that asks the store for the 5 best matches per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})