Skip to content

Vector Search Guide

Overview

NetIntel-OCR integrates with Milvus to enable semantic search across processed documents, network components, and architectural patterns.

Milvus Setup

Docker Installation

# Start Milvus standalone
docker run -d \
  --name milvus-standalone \
  -p 19530:19530 \
  -p 9091:9091 \
  -v milvus_data:/var/lib/milvus \
  milvusdb/milvus:latest \
  milvus run standalone

Kubernetes Installation

# Install with Helm
helm repo add milvus https://milvus-io.github.io/milvus-helm/
helm install milvus milvus/milvus --set cluster.enabled=false

Document Ingestion

Basic Ingestion

# Process and ingest document
netintel-ocr db set-store milvus \
             --milvus-host localhost:19530 \
             --ingest \
             document.pdf

Batch Ingestion

# Ingest multiple documents
for pdf in *.pdf; do
  netintel-ocr process batch --ingest --collection network_docs "$pdf"
done

# Or use batch mode
netintel-ocr process batch --collection network_docs *.pdf

Collection Configuration

# Define collection schema
from netintel_ocr.vector import create_collection

create_collection(
    name="network_components",
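    dim=768,  # Embedding dimension; must match the embedding model's output size (e.g., 768 for nomic-embed-text, 384 for all-MiniLM-L6-v2)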
    fields={
        "component_type": "VARCHAR",
        "security_zone": "VARCHAR", 
        "criticality": "INT32",
        "document_id": "VARCHAR",
        "page_number": "INT32"
    }
)

Embedding Generation

Component Embeddings

Documents are processed into multiple embedding types:

# Generated embeddings
embeddings = {
    "document": [...],      # Full document embedding
    "pages": [...],        # Per-page embeddings
    "components": [...],   # Network component embeddings
    "relationships": [...] # Connection embeddings
}

Custom Embedding Model

# Use specific embedding model
netintel-ocr config set embedding.model all-MiniLM-L6-v2 \
             --ingest document.pdf

# Or use Ollama embeddings
netintel-ocr config set embedding.model ollama/nomic-embed-text \
             --ingest document.pdf

Search Queries

# Search for components
netintel-ocr db search "firewall in DMZ zone"

# Search with filters
netintel-ocr db search "router" \
             --filter "security_zone=external" \
             --limit 10

Python API

from netintel_ocr.vector import VectorSearch

search = VectorSearch(host="localhost:19530")

# Semantic search
results = search.query(
    text="Find all components with internet exposure",
    collection="network_components",
    limit=20
)

# Filtered search
results = search.query(
    text="database servers",
    filter="criticality >= 8",
    limit=10
)

# Component relationship search
connections = search.find_connections(
    source="web_server",
    max_hops=3
)

REST API

# Search endpoint
curl -X POST http://localhost:8000/search \
  -H "Content-Type: application/json" \
  -d '{
    "query": "firewall configurations",
    "collection": "network_docs",
    "limit": 10,
    "filter": {
      "document_type": "security"
    }
  }'

Query Examples

Find Similar Network Architectures

# Process the reference architecture diagram
reference = process_diagram("reference_architecture.png")

# Find similar architectures
similar = search.find_similar(
    embedding=reference.embedding,
    threshold=0.85,
    limit=5
)

Security Zone Analysis

# Find all components in DMZ
dmz_components = search.query(
    filter="security_zone='DMZ'",
    include_metadata=True
)

# Find cross-zone connections
cross_zone = search.query(
    text="connections between internal and external zones",
    include_relationships=True
)

Compliance Queries

# Find exposed services
exposed = search.query(
    text="services accessible from internet",
    filter="exposure='external'"
)

# Find unencrypted connections
unencrypted = search.query(
    text="connections without encryption",
    filter="encrypted=false"
)

Index Management

Create Indexes

from netintel_ocr.vector import IndexManager

index = IndexManager()

# Create IVF index for large collections
index.create_index(
    collection="network_components",
    index_type="IVF_FLAT",
    metric_type="L2",
    params={"nlist": 1024}
)

# Create HNSW index for high accuracy
index.create_index(
    collection="critical_components",
    index_type="HNSW",
    params={"M": 16, "efConstruction": 200}
)

Optimize Performance

# Compact collection
netintel-ocr db compact network_components

# Build index
netintel-ocr db build-index network_components

# Load collection into memory
netintel-ocr db load network_components

Integration Examples

CMDB Population

from netintel_ocr import DocumentProcessor
from cmdb_client import CMDBClient

# Process document
processor = DocumentProcessor()
results = processor.process("network_design.pdf")

# Populate CMDB
cmdb = CMDBClient()
for component in results.components:
    cmdb.create_ci({
        "name": component.name,
        "type": component.type,
        "attributes": component.metadata,
        "relationships": component.connections
    })

Change Detection

# Compare document versions
old_doc = search.get_document("design_v1.pdf")
new_doc = process_document("design_v2.pdf")

# Find changes
changes = search.compare_architectures(
    old_doc.embedding,
    new_doc.embedding
)

print(f"Added components: {changes.added}")
print(f"Removed components: {changes.removed}")
print(f"Modified connections: {changes.modified}")

Knowledge Graph

# Build knowledge graph
from netintel_ocr.graph import KnowledgeGraph

kg = KnowledgeGraph()

# Add documents to graph
for doc in documents:
    kg.add_document(doc)

# Query relationships
paths = kg.find_path(
    from_component="internet_gateway",
    to_component="database_server"
)

# Find critical paths
critical = kg.find_critical_paths(
    metric="security_exposure"
)

Monitoring

Collection Statistics

# View collection stats
netintel-ocr db stats --collection network_components

# Monitor query performance
netintel-ocr db query-stats --last 24h

Health Checks

# Check Milvus health
from netintel_ocr.vector import health_check

status = health_check()
print(f"Milvus status: {status.state}")
print(f"Collections: {status.collections}")
print(f"Total vectors: {status.total_vectors}")

Best Practices

  1. Collection Partitioning: Partition by document type or date
  2. Embedding Cache: Cache frequently accessed embeddings
  3. Batch Operations: Use batch insert for large datasets
  4. Index Selection: Choose the index type based on dataset size and accuracy needs (e.g., IVF_FLAT for large collections, HNSW for high accuracy)
  5. Regular Compaction: Compact collections regularly (e.g., weekly) using netintel-ocr db compact

Next Steps