A knowledge graph is an organized representation of real-world entities and their relationships. In addition to the data and relationships themselves, it stores overarching frameworks known as organizing principles.
Building a knowledge graph is simple:
- Gather the data
- Chunk the data
- Vectorize the data
- Pass the data to an LLM to extract nodes and relationships
- Use the output to generate the graph
Building a Knowledge Graph in Python
import os
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_neo4j import Neo4jGraph
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_community.graphs.graph_document import Node, Relationship
from dotenv import load_dotenv
load_dotenv()
# Directory of course PDFs to ingest into the graph.
DOCS_PATH = "llm-knowledge-graph/data/course/pdfs"
# Chat model used by LLMGraphTransformer to extract nodes/relationships.
llm = ChatOpenAI(
openai_api_key=os.getenv('OPENAI_API_KEY'),
model_name="gpt-3.5-turbo"
)
# Embedding model for chunk vectors (ada-002 produces 1536-dim vectors,
# matching the vector index created later in this script).
embedding_provider = OpenAIEmbeddings(
openai_api_key=os.getenv('OPENAI_API_KEY'),
model="text-embedding-ada-002"
)
# Neo4j connection; credentials are loaded from .env via load_dotenv().
graph = Neo4jGraph(
url=os.getenv('NEO4J_URI'),
username=os.getenv('NEO4J_USERNAME'),
password=os.getenv('NEO4J_PASSWORD')
)
# Transformer that turns document chunks into graph documents via the LLM.
doc_transformer = LLMGraphTransformer(
llm=llm,
)
# Load and split the documents
# Recursively load every PDF under DOCS_PATH with PyPDFLoader (one doc per page).
loader = DirectoryLoader(DOCS_PATH, glob="**/*.pdf", loader_cls=PyPDFLoader)
# Split on blank lines into ~1500-character chunks with 200-character overlap.
text_splitter = CharacterTextSplitter(
separator="\n\n",
chunk_size=1500,
chunk_overlap=200,
)
docs = loader.load()
chunks = text_splitter.split_documents(docs)
# Ingest each chunk: store it, embed it, extract entities, and link them.
for chunk in chunks:
    filename = os.path.basename(chunk.metadata["source"])
    # Unique chunk id: source filename + page number.
    # (Fixed: original used a "(unknown)" placeholder instead of `filename`,
    # and nested double quotes inside a double-quoted f-string, which is a
    # SyntaxError before Python 3.12.)
    chunk_id = f"{filename}.{chunk.metadata['page']}"
    print("Processing -", chunk_id)

    # Embed the chunk text so it can be found by vector similarity search.
    chunk_embedding = embedding_provider.embed_query(chunk.page_content)

    # Add the Document and Chunk nodes to the graph and attach the embedding.
    properties = {
        "filename": filename,
        "chunk_id": chunk_id,
        "text": chunk.page_content,
        "embedding": chunk_embedding,
    }
    graph.query("""
MERGE (d:Document {id: $filename})
MERGE (c:Chunk {id: $chunk_id})
SET c.text = $text
MERGE (d)<-[:PART_OF]-(c)
WITH c
CALL db.create.setNodeVectorProperty(c, 'textEmbedding', $embedding)
""",
        properties
    )

    # Generate the entities and relationships from the chunk via the LLM.
    graph_docs = doc_transformer.convert_to_graph_documents([chunk])

    # Map every extracted entity back to its source Chunk node so the KG
    # context can be retrieved alongside vector hits later.
    for graph_doc in graph_docs:
        chunk_node = Node(
            id=chunk_id,
            type="Chunk"
        )
        for node in graph_doc.nodes:
            graph_doc.relationships.append(
                Relationship(
                    source=chunk_node,
                    target=node,
                    type="HAS_ENTITY"
                )
            )
        # add the graph documents to the graph
    graph.add_graph_documents(graph_docs)
# Create the vector index
graph.query("""
CREATE VECTOR INDEX `chunkVector`
IF NOT EXISTS
FOR (c: Chunk) ON (c.textEmbedding)
OPTIONS {indexConfig: {
`vector.dimensions`: 1536,
`vector.similarity_function`: 'cosine'
}};""")Query Knowledge Graphs with Cypher
- Count entity types
MATCH ()-[:HAS_ENTITY]->(e)
RETURN labels(e) AS labels, count(e) AS nodes
ORDER BY nodes DESC;
- Most-mentioned Technology entities
MATCH ()-[r:HAS_ENTITY]->(e:Technology)
RETURN e.id AS entityId, count(r) AS mentions
ORDER BY mentions DESC;
- Relationship types and counts between entities
MATCH ()-[:HAS_ENTITY]->(entity)-[r]->(other)<-[:HAS_ENTITY]-()
RETURN DISTINCT type(r) AS relType, count(r) AS number
ORDER BY number DESC;
- Build simple statements: entity label/id + relation + other label/id
MATCH ()-[:HAS_ENTITY]->(entity)-[r]->(other)<-[:HAS_ENTITY]-()
RETURN DISTINCT labels(entity), entity.id, type(r), labels(other), other.id;
- Find documents related to one document via shared entities
MATCH (d:Document { id: "your_doc_id.pdf" })<-[:PART_OF]-(c:Chunk)
MATCH (c)-[:HAS_ENTITY]->(entity)<-[:HAS_ENTITY]-(otherChunk)
MATCH (otherChunk)-[:PART_OF]->(otherDocument)
RETURN DISTINCT entity.id, otherDocument.id;
- Restrict graph to entities that appear in a specific chunk/document (return paths)
MATCH (d:Document { id: "your_doc_id.pdf" })<-[:PART_OF]-(c:Chunk)
MATCH (c)-[:HAS_ENTITY]->(e)
MATCH p = (e)-[r]-(e2)
WHERE (c)-[:HAS_ENTITY]->(e2)
RETURN p;
- Unwind relationships from paths to get node labels/ids and relation types
MATCH (d:Document { id: "your_doc_id.pdf" })<-[:PART_OF]-(c:Chunk)
MATCH (c)-[:HAS_ENTITY]->(e)
MATCH p = (e)-[r]-(e2)
WHERE (c)-[:HAS_ENTITY]->(e2)
UNWIND relationships(p) AS rels
RETURN
labels(startNode(rels))[0] AS eLabel,
startNode(rels).id AS eId,
type(rels) AS relType,
labels(endNode(rels))[0] AS e2Label,
endNode(rels).id AS e2Id;
- Vector similarity: encode a user query, query node vector index (replace token)
WITH genai.vector.encode(
"Why do LLMs get things wrong?",
"OpenAI",
{ token: "sk-..." }
) AS userEmbedding
CALL db.index.vector.queryNodes('chunkVector', 6, userEmbedding)
YIELD node, score
RETURN node.text, score;
- Vector search + include knowledge-graph context (entities & relations per chunk)
WITH genai.vector.encode(
"Why do LLMs get things wrong?",
"OpenAI",
{ token: "sk-..." }
) AS userEmbedding
CALL db.index.vector.queryNodes('chunkVector', 6, userEmbedding)
YIELD node, score
MATCH (node)-[:HAS_ENTITY]->(e)
MATCH p = (e)-[r]-(e2)
WHERE (node)-[:HAS_ENTITY]->(e2)
UNWIND relationships(p) AS rels
WITH node, score, collect([
labels(startNode(rels))[0],
startNode(rels).id,
type(rels),
labels(endNode(rels))[0],
endNode(rels).id
]) AS kg
RETURN node.text, score, kg;

Query Knowledge Graphs with LLMs
import os
from langchain_openai import ChatOpenAI
from langchain_neo4j import GraphCypherQAChain, Neo4jGraph
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
load_dotenv()
# temperature=0 -> deterministic output; important for Cypher generation.
llm = ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"), temperature=0)
# Neo4j connection; credentials come from the environment (.env).
graph = Neo4jGraph(url=os.getenv("NEO4J_URI"),
username=os.getenv("NEO4J_USERNAME"),
password=os.getenv("NEO4J_PASSWORD"))
# Prompt that instructs the LLM how to turn a question into Cypher.
# {schema} and {question} are filled in by GraphCypherQAChain at run time.
CYPHER_GENERATION_TEMPLATE = """Task: Generate Cypher statement to query a graph database.
Instructions:
- Use only provided relationship types/properties.
- Only return the Cypher statement.
- Use case-insensitive matching.
Schema:
{schema}
The question is:
{question}"""
cypher_prompt = PromptTemplate(template=CYPHER_GENERATION_TEMPLATE,
input_variables=["schema", "question"])
# Chain: LLM generates Cypher from schema + question, runs it, then answers.
# NOTE(review): allow_dangerous_requests=True permits LLM-generated write
# queries -- pair with a read-only database role in production.
cypher_chain = GraphCypherQAChain.from_llm(
llm,
graph=graph,
cypher_prompt=cypher_prompt,
verbose=True,
allow_dangerous_requests=True
)
def run_cypher(q):
return cypher_chain.invoke({"query": q})
while (q := input("> ")) != "exit":
print(run_cypher(q))- allow_dangerous_requests=True lets the LLM generate write/privileged Cypher — in production use read-only roles / RBAC.
- Add case-insensitive example(s) and graph-navigation examples to the prompt. Example additions:
# Case-insensitive matching example
MATCH (c:Chunk)-[:HAS_ENTITY]->(e)
WHERE e.id =~ '(?i)entityName'
# Find documents that reference an entity
MATCH (d:Document)<-[:PART_OF]-(:Chunk)-[:HAS_ENTITY]->(e)
WHERE e.id =~ '(?i)entityName'
RETURN d
- GraphCypherQAChain config options
- exclude_types=["Session", "Message", ...] — omit node/rel types (e.g., conversation data).
- enhanced_schema=True — include sample property values in schema fed to LLM (can improve accuracy; larger prompt & slower).
- Separate LLMs: use a deterministic model (temp=0) for Cypher; cheaper/faster model for QA
qa_llm = ChatOpenAI(openai_api_key=..., model="gpt-3.5-turbo")
cypher_llm = ChatOpenAI(openai_api_key=..., model="gpt-4", temperature=0)
cypher_chain = GraphCypherQAChain.from_llm(
qa_llm=qa_llm, cypher_llm=cypher_llm, graph=graph,
cypher_prompt=cypher_prompt, allow_dangerous_requests=True)

Integrating Retriever
The retriever acts as a bridge between the application and the knowledge graph.
Vector search returns relevant chunks; retrieval_query adds KG metadata (document id, entities/relations) for LLM consumption
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_neo4j import Neo4jGraph, Neo4jVector
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
# temperature=0 -> deterministic answers from the QA step.
llm = ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"), temperature=0)
# Embedding model for encoding user queries.
# NOTE(review): assumes the default model matches the one used at ingestion
# (text-embedding-ada-002, 1536 dims) -- confirm, or pass model= explicitly.
emb = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
graph = Neo4jGraph(
url=os.getenv("NEO4J_URI"),
username=os.getenv("NEO4J_USERNAME"),
password=os.getenv("NEO4J_PASSWORD"),
)
# Vector store backed by the existing `chunkVector` index.
# retrieval_query enriches each vector hit with its Document id and the
# entity-relationship triples (KG context) linked to the matched chunk.
chunk_vector = Neo4jVector.from_existing_index(
emb,
graph=graph,
index_name="chunkVector",
embedding_node_property="textEmbedding",
text_node_property="text",
retrieval_query="""
// get document and KG context for the node
MATCH (node)-[:PART_OF]->(d:Document)
WITH node, score, d
MATCH (node)-[:HAS_ENTITY]->(e)
MATCH p = (e)-[r]-(e2)
WHERE (node)-[:HAS_ENTITY]->(e2)
UNWIND relationships(p) AS rels
WITH node, score, d,
collect(apoc.text.join(
[labels(startNode(rels))[0], startNode(rels).id, type(rels),
labels(endNode(rels))[0], endNode(rels).id], " ")) AS kg
RETURN node.text AS text, score,
{ document: d.id, entities: kg } AS metadata
"""
)
# System instructions for the QA step; {context} is filled with the
# retrieved chunks (text + KG metadata) by create_stuff_documents_chain.
instructions = (
"Use the given context to answer the question. Include the document id and "
"relevant info. If unknown, say you don't know. Context: {context}"
)
prompt = ChatPromptTemplate.from_messages([("system", instructions), ("human", "{input}")])
vector_retriever = chunk_vector.as_retriever()
# Stuff all retrieved documents into the prompt, then wrap with retrieval.
chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(vector_retriever, chain)
def find_chunk(q):
    """Run the retrieval chain on the user question `q` and return its result."""
    response = retrieval_chain.invoke({"input": q})
    return response

if __name__ == "__main__":
    # Simple REPL; type "exit" to quit.
    while (q := input("> ")) != "exit":
        print(find_chunk(q))