A knowledge graph is an organized representation of real-world entities and their relationships. Alongside the data and relationships themselves, it stores frameworks known as organizing principles, which describe how the data is structured.
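For example, a single fact can be stored as two labeled nodes joined by a typed relationship, with the node labels and relationship type acting as the organizing principle. A minimal sketch, assuming a Neo4jGraph connection like the one created in the script below (the labels and ids here are hypothetical):

# One fact as a graph: two entity nodes and a typed relationship.
# The labels (:Course, :Technology) and the relationship type (TEACHES)
# act as the organizing principle.
graph.query("""
    MERGE (c:Course {id: 'llm-fundamentals'})
    MERGE (t:Technology {id: 'Neo4j'})
    MERGE (c)-[:TEACHES]->(t)
""")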
At a high level, building a knowledge graph takes five steps:

  1. Gather the data
  2. Chunk the data
  3. Vectorize the chunks
  4. Pass the chunks to an LLM to extract nodes and relationships
  5. Use the LLM output to generate the graph
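
Mapped to the components used below, the pipeline is roughly as follows (a sketch, not a complete program; all the names come from the full script in the next section):

# Pipeline sketch; loader, text_splitter, embedding_provider,
# doc_transformer, and graph are all defined in the script below.
docs = loader.load()                                              # 1. gather
chunks = text_splitter.split_documents(docs)                      # 2. chunk
vectors = [embedding_provider.embed_query(c.page_content)         # 3. vectorize
           for c in chunks]
graph_docs = doc_transformer.convert_to_graph_documents(chunks)   # 4. extract
graph.add_graph_documents(graph_docs)                             # 5. generate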

Building a Knowledge Graph in Python

import os
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_neo4j import Neo4jGraph
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_community.graphs.graph_document import Node, Relationship
 
from dotenv import load_dotenv
load_dotenv()
 
DOCS_PATH = "llm-knowledge-graph/data/course/pdfs"
 
llm = ChatOpenAI(
    openai_api_key=os.getenv('OPENAI_API_KEY'), 
    model_name="gpt-3.5-turbo"
)
 
embedding_provider = OpenAIEmbeddings(
    openai_api_key=os.getenv('OPENAI_API_KEY'),
    model="text-embedding-ada-002"
)
 
graph = Neo4jGraph(
    url=os.getenv('NEO4J_URI'),
    username=os.getenv('NEO4J_USERNAME'),
    password=os.getenv('NEO4J_PASSWORD')
)
 
doc_transformer = LLMGraphTransformer(
    llm=llm,
)
 
# Load and split the documents
loader = DirectoryLoader(DOCS_PATH, glob="**/*.pdf", loader_cls=PyPDFLoader)
 
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1500,
    chunk_overlap=200,
)
 
docs = loader.load()
chunks = text_splitter.split_documents(docs)
 
for chunk in chunks:
 
    filename = os.path.basename(chunk.metadata["source"])
    chunk_id = f"{filename}.{chunk.metadata['page']}"
    print("Processing -", chunk_id)
 
    # Embed the chunk
    chunk_embedding = embedding_provider.embed_query(chunk.page_content)
 
    # Add the Document and Chunk nodes to the graph
    properties = {
        "filename": filename,
        "chunk_id": chunk_id,
        "text": chunk.page_content,
        "embedding": chunk_embedding
    }
    
    graph.query("""
        MERGE (d:Document {id: $filename})
        MERGE (c:Chunk {id: $chunk_id})
        SET c.text = $text
        MERGE (d)<-[:PART_OF]-(c)
        WITH c
        CALL db.create.setNodeVectorProperty(c, 'textEmbedding', $embedding)
        """, 
        properties
    )
 
    # Generate the entities and relationships from the chunk
    graph_docs = doc_transformer.convert_to_graph_documents([chunk])
 
    # Map the entities in the graph documents to the chunk node
    for graph_doc in graph_docs:
        chunk_node = Node(
            id=chunk_id,
            type="Chunk"
        )
 
        for node in graph_doc.nodes:
 
            graph_doc.relationships.append(
                Relationship(
                    source=chunk_node,
                    target=node,
                    type="HAS_ENTITY"
                )
            )
 
    # Add the graph documents to the graph
    graph.add_graph_documents(graph_docs)
 
# Create the vector index
graph.query("""
    CREATE VECTOR INDEX `chunkVector`
    IF NOT EXISTS
    FOR (c:Chunk) ON (c.textEmbedding)
    OPTIONS {indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
    }};""")

Query Knowledge Graphs with Cypher

  1. Count entity types
MATCH ()-[:HAS_ENTITY]->(e)
RETURN labels(e) AS labels, count(e) AS nodes
ORDER BY nodes DESC;
  2. Most-mentioned Technology entities
MATCH ()-[r:HAS_ENTITY]->(e:Technology)
RETURN e.id AS entityId, count(r) AS mentions
ORDER BY mentions DESC;
  3. Relationship types and counts between entities
MATCH ()-[:HAS_ENTITY]->(entity)-[r]->(other)<-[:HAS_ENTITY]-()
RETURN DISTINCT type(r) AS relType, count(r) AS number
ORDER BY number DESC;
  4. Build simple statements: entity label/id + relation + other label/id
MATCH ()-[:HAS_ENTITY]->(entity)-[r]->(other)<-[:HAS_ENTITY]-()
RETURN DISTINCT labels(entity), entity.id, type(r), labels(other), other.id;
  5. Find documents related to one document via shared entities
MATCH (d:Document { id: "your_doc_id.pdf" })<-[:PART_OF]-(c:Chunk)
MATCH (c)-[:HAS_ENTITY]->(entity)<-[:HAS_ENTITY]-(otherChunk)
MATCH (otherChunk)-[:PART_OF]->(otherDocument)
RETURN DISTINCT entity.id, otherDocument.id;
  6. Restrict graph to entities that appear in a specific chunk/document (return paths)
MATCH (d:Document { id: "your_doc_id.pdf" })<-[:PART_OF]-(c:Chunk)
MATCH (c)-[:HAS_ENTITY]->(e)
MATCH p = (e)-[r]-(e2)
WHERE (c)-[:HAS_ENTITY]->(e2)
RETURN p;
  7. Unwind relationships from paths to get node labels/ids and relation types
MATCH (d:Document { id: "your_doc_id.pdf" })<-[:PART_OF]-(c:Chunk)
MATCH (c)-[:HAS_ENTITY]->(e)
MATCH p = (e)-[r]-(e2)
WHERE (c)-[:HAS_ENTITY]->(e2)
UNWIND relationships(p) AS rels
RETURN
  labels(startNode(rels))[0] AS eLabel,
  startNode(rels).id AS eId,
  type(rels) AS relType,
  labels(endNode(rels))[0] AS e2Label,
  endNode(rels).id AS e2Id;
  8. Vector similarity: encode a user query, query node vector index (replace token)
WITH genai.vector.encode(
  "Why do LLMs get things wrong?",
  "OpenAI",
  { token: "sk-..." }
) AS userEmbedding
CALL db.index.vector.queryNodes('chunkVector', 6, userEmbedding)
YIELD node, score
RETURN node.text, score;
  9. Vector search + include knowledge-graph context (entities & relations per chunk)
WITH genai.vector.encode(
  "Why do LLMs get things wrong?",
  "OpenAI",
  { token: "sk-..." }
) AS userEmbedding
CALL db.index.vector.queryNodes('chunkVector', 6, userEmbedding)
YIELD node, score
MATCH (node)-[:HAS_ENTITY]->(e)
MATCH p = (e)-[r]-(e2)
WHERE (node)-[:HAS_ENTITY]->(e2)
UNWIND relationships(p) AS rels
WITH node, score, collect([
    labels(startNode(rels))[0],
    startNode(rels).id,
    type(rels),
    labels(endNode(rels))[0],
    endNode(rels).id
]) AS kg
RETURN node.text, score, kg;

Query Knowledge Graphs with LLMs

import os
from langchain_openai import ChatOpenAI
from langchain_neo4j import GraphCypherQAChain, Neo4jGraph
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
 
load_dotenv()
llm = ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"), temperature=0)
graph = Neo4jGraph(url=os.getenv("NEO4J_URI"),
                   username=os.getenv("NEO4J_USERNAME"),
                   password=os.getenv("NEO4J_PASSWORD"))
 
CYPHER_GENERATION_TEMPLATE = """Task: Generate Cypher statement to query a graph database.
Instructions:
- Use only provided relationship types/properties.
- Only return the Cypher statement.
- Use case-insensitive matching.
Schema:
{schema}
The question is:
{question}"""
 
cypher_prompt = PromptTemplate(template=CYPHER_GENERATION_TEMPLATE,
                               input_variables=["schema", "question"])
 
cypher_chain = GraphCypherQAChain.from_llm(
    llm,
    graph=graph,
    cypher_prompt=cypher_prompt,
    verbose=True,
    allow_dangerous_requests=True
)
 
def run_cypher(q):
    return cypher_chain.invoke({"query": q})
 
while (q := input("> ")) != "exit":
    print(run_cypher(q))
  • allow_dangerous_requests=True lets the LLM generate write/privileged Cypher; in production, use read-only roles / RBAC instead.
  • Add case-insensitive example(s) and graph-navigation examples to the prompt. Example additions:
# Case-insensitive matching example
MATCH (c:Chunk)-[:HAS_ENTITY]->(e)
WHERE e.id =~ '(?i)entityName'
 
# Find documents that reference an entity
MATCH (d:Document)<-[:PART_OF]-(:Chunk)-[:HAS_ENTITY]->(e)
WHERE e.id =~ '(?i)entityName'
RETURN d
  • GraphCypherQAChain config options
    • exclude_types=["Session", "Message", …]: omit node/relationship types (e.g., conversation data) from the schema.
    • enhanced_schema=True: include sample property values in the schema fed to the LLM; this can improve accuracy at the cost of a larger prompt and slower calls. Both options appear in the sketch after this list.
    • Separate LLMs: use a deterministic model (temperature=0) for Cypher generation and a cheaper/faster model for the QA step:
qa_llm = ChatOpenAI(openai_api_key=..., model="gpt-3.5-turbo")
cypher_llm = ChatOpenAI(openai_api_key=..., model="gpt-4", temperature=0)
cypher_chain = GraphCypherQAChain.from_llm(
    qa_llm=qa_llm, cypher_llm=cypher_llm, graph=graph,
    cypher_prompt=cypher_prompt, allow_dangerous_requests=True)
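
A sketch combining the options above; enhanced_schema is a parameter of Neo4jGraph and exclude_types a parameter of GraphCypherQAChain.from_llm in langchain_neo4j (verify the names against your installed version):

# Sketch: reuses the qa_llm / cypher_llm defined above.
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
    enhanced_schema=True,  # include sample property values in the schema
)
cypher_chain = GraphCypherQAChain.from_llm(
    qa_llm=qa_llm,
    cypher_llm=cypher_llm,
    graph=graph,
    cypher_prompt=cypher_prompt,
    exclude_types=["Session", "Message"],  # omit conversation data from the schema
    allow_dangerous_requests=True,
)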

Integrating a Retriever

A retriever acts as a bridge between the application and the knowledge graph.
Vector search returns the relevant chunks, and the retrieval_query adds knowledge-graph metadata (document id, entities, and relations) for the LLM to consume.

import os
from dotenv import load_dotenv
load_dotenv()
 
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_neo4j import Neo4jGraph, Neo4jVector
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
 
llm = ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"), temperature=0)
emb = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
 
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
)
 
chunk_vector = Neo4jVector.from_existing_index(
    emb,
    graph=graph,
    index_name="chunkVector",
    embedding_node_property="textEmbedding",
    text_node_property="text",
    retrieval_query="""
// get document and KG context for the node
MATCH (node)-[:PART_OF]->(d:Document)
WITH node, score, d
MATCH (node)-[:HAS_ENTITY]->(e)
MATCH p = (e)-[r]-(e2)
WHERE (node)-[:HAS_ENTITY]->(e2)
UNWIND relationships(p) AS rels
WITH node, score, d,
  collect(apoc.text.join(
    [labels(startNode(rels))[0], startNode(rels).id, type(rels),
     labels(endNode(rels))[0], endNode(rels).id], " ")) AS kg
RETURN node.text AS text, score,
  { document: d.id, entities: kg } AS metadata
"""
)
 
instructions = (
    "Use the given context to answer the question. Include the document id and "
    "relevant info. If unknown, say you don't know. Context: {context}"
)
prompt = ChatPromptTemplate.from_messages([("system", instructions), ("human", "{input}")])
 
vector_retriever = chunk_vector.as_retriever()
chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(vector_retriever, chain)
 
def find_chunk(q):
    return retrieval_chain.invoke({"input": q})
 
if __name__ == "__main__":
    while (q := input("> ")) != "exit":
        print(find_chunk(q))