Skip to main content
This guide walks through the complete Raydocs workflow from scratch: creating a workspace, designing an extraction template, uploading documents, and retrieving results.

What You’ll Build

A complete invoice extraction pipeline that:
  1. Creates a dedicated workspace
  2. Defines a template to extract invoice data
  3. Processes multiple invoices with auto-extraction
  4. Retrieves structured results

Prerequisites

  • A Raydocs API token with full permissions (workspaces-write, templates-write, sessions-write)
  • Invoice documents to process (PDF format)

Complete Implementation

import time
from raydocs_client import RaydocsClient

# Seconds to wait between status checks while an extraction is running.
POLL_INTERVAL_SECONDS = 5
# Upper bound on how long to wait for one document before giving up, so a
# stuck extraction cannot hang the script forever. Adjust to your workload.
POLL_TIMEOUT_SECONDS = 600


def _wait_for_extraction(client, session_id,
                         interval=POLL_INTERVAL_SECONDS,
                         timeout=POLL_TIMEOUT_SECONDS):
    """Poll a session until its first result finishes or the timeout elapses.

    Args:
        client: The Raydocs client used to query results.
        session_id: ID of the session to poll.
        interval: Seconds to sleep between polls.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        A ``(status, result)`` tuple. ``status`` is ``'completed'``,
        ``'failed'`` or ``'timeout'``; ``result`` is the first entry returned
        by ``client.get_results`` (``None`` on timeout).
    """
    deadline = time.monotonic() + timeout
    while True:
        results = client.get_results(session_id)
        if results:
            result = results[0]
            if result['status'] in ('completed', 'failed'):
                return result['status'], result
        # Check the deadline after polling so that at least one poll is
        # always made, even with a zero timeout.
        if time.monotonic() >= deadline:
            return 'timeout', None
        time.sleep(interval)


def _format_amount(value):
    """Format a numeric amount as ``$1,234.50``; return ``n/a`` otherwise.

    Extracted fields may be null (the template asks for null when a field is
    not found), and formatting ``None`` with ``:,.2f`` raises ``TypeError``.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f"${value:,.2f}"
    return "n/a"


def main():
    """Run the end-to-end invoice extraction pipeline.

    Steps: pick or create a workspace, create the extraction template, upload
    the documents, create auto-extracting sessions, poll each session for its
    result, and print a summary.

    Returns:
        A list with one dict per processed document. Successful entries hold
        ``document``, ``invoice_number``, ``vendor``, ``total``, ``currency``,
        ``due_date`` and ``full_data``; unsuccessful entries hold ``document``
        and ``status`` (``"error"`` or ``"timeout"``).
    """
    # Initialize client
    client = RaydocsClient("your_api_token")

    # ═══════════════════════════════════════════════════════════
    # STEP 1: Create a Workspace
    # ═══════════════════════════════════════════════════════════
    print("📁 Setting up workspace...")

    # Check for existing workspaces
    workspaces = client.list_workspaces()

    if workspaces:
        # Use existing workspace
        workspace = workspaces[0]
        print(f"   Using existing workspace: {workspace['name']}")
    else:
        # Create new workspace
        workspace = client.create_workspace(
            name="Invoice Processing",
            icon="📄"
        )
        print(f"   Created workspace: {workspace['name']}")

    workspace_id = workspace['id']

    # ═══════════════════════════════════════════════════════════
    # STEP 2: Create an Extraction Template
    # ═══════════════════════════════════════════════════════════
    print("\n📋 Creating extraction template...")

    invoice_schema = {
        "config": {
            "reasoning_enabled": True,
            "system_message": "You are extracting data from invoices. Be precise with numbers and dates. Return null for any field not found in the document."
        },
        "groups": {
            # Group 1: Basic invoice information
            "invoice_header": {
                "search_query": "invoice number, invoice date, due date, total amount, subtotal, tax",
                "fields": {
                    "invoice_number": {
                        "type": "string",
                        "extraction_prompt": "Extract the invoice number or invoice ID"
                    },
                    "invoice_date": {
                        "type": "string",
                        "extraction_prompt": "Extract the invoice date in YYYY-MM-DD format"
                    },
                    "due_date": {
                        "type": "string",
                        "extraction_prompt": "Extract the payment due date in YYYY-MM-DD format"
                    },
                    "subtotal": {
                        "type": "number",
                        "extraction_prompt": "Extract the subtotal amount before tax"
                    },
                    "tax_amount": {
                        "type": "number",
                        "extraction_prompt": "Extract the tax amount"
                    },
                    "total_amount": {
                        "type": "number",
                        "extraction_prompt": "Extract the total amount due"
                    },
                    "currency": {
                        "type": "string",
                        "extraction_prompt": "Extract the currency code (USD, EUR, GBP, etc.)"
                    }
                }
            },

            # Group 2: Vendor/Supplier information
            "vendor_info": {
                "search_query": "vendor, supplier, seller, company name, business name, from, bill from",
                "fields": {
                    "vendor_name": {
                        "type": "string",
                        "extraction_prompt": "Extract the vendor or supplier company name"
                    },
                    "vendor_address": {
                        "type": "string",
                        "extraction_prompt": "Extract the vendor's full address"
                    },
                    "vendor_email": {
                        "type": "string",
                        "extraction_prompt": "Extract the vendor's email address"
                    },
                    "vendor_phone": {
                        "type": "string",
                        "extraction_prompt": "Extract the vendor's phone number"
                    }
                }
            },

            # Group 3: Customer/Billing information
            "customer_info": {
                "search_query": "bill to, customer, client, buyer, ship to",
                "fields": {
                    "customer_name": {
                        "type": "string",
                        "extraction_prompt": "Extract the customer or client name"
                    },
                    "customer_address": {
                        "type": "string",
                        "extraction_prompt": "Extract the customer's billing address"
                    }
                }
            },

            # Group 4: Line items
            "line_items": {
                "search_query": "items, products, services, description, quantity, price, amount",
                "fields": {
                    "items": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "description": {"type": "string"},
                                "quantity": {"type": "number"},
                                "unit_price": {"type": "number"},
                                "total": {"type": "number"}
                            }
                        },
                        "extraction_prompt": "Extract all line items with description, quantity, unit price, and line total"
                    }
                }
            },

            # Group 5: Payment information
            "payment_info": {
                "search_query": "payment terms, bank account, wire transfer, payment method",
                "fields": {
                    "payment_terms": {
                        "type": "string",
                        "extraction_prompt": "Extract payment terms (e.g., Net 30, Due on receipt)"
                    },
                    "bank_name": {
                        "type": "string",
                        "extraction_prompt": "Extract the bank name for wire transfers"
                    },
                    "account_number": {
                        "type": "string",
                        "extraction_prompt": "Extract the bank account number (mask middle digits for security)"
                    }
                }
            }
        }
    }

    template = client.create_template(
        workspace_id=workspace_id,
        name="Invoice Extractor v1",
        description="Comprehensive invoice data extraction template",
        schema=invoice_schema
    )

    template_id = template['id']
    print(f"   Created template: {template['name']}")
    print(f"   Template ID: {template_id}")

    # ═══════════════════════════════════════════════════════════
    # STEP 3: Upload Documents
    # ═══════════════════════════════════════════════════════════
    print("\n📤 Uploading documents...")

    documents = [
        "invoices/acme-corp-001.pdf",
        "invoices/globex-002.pdf",
        "invoices/initech-003.pdf"
    ]

    file_keys = []
    for doc in documents:
        print(f"   Uploading: {doc}")
        key = client.upload_file(doc)
        file_keys.append(key)

    print(f"   ✓ Uploaded {len(file_keys)} documents")

    # ═══════════════════════════════════════════════════════════
    # STEP 4: Create Sessions with Auto-Extract
    # ═══════════════════════════════════════════════════════════
    print("\n🚀 Starting extraction...")

    sessions = client.batch_create_sessions(
        template_id=template_id,
        file_keys=file_keys,
        auto_extract=True
    )

    print(f"   Created {len(sessions)} sessions with auto-extraction enabled")

    # ═══════════════════════════════════════════════════════════
    # STEP 5: Poll for Results
    # ═══════════════════════════════════════════════════════════
    print("\n⏳ Processing documents...")

    extracted_data = []

    # NOTE(review): this pairs sessions with documents by position, i.e. it
    # assumes batch_create_sessions returns sessions in file_keys order —
    # confirm against the API reference.
    for document, session in zip(documents, sessions):
        doc_name = document.split('/')[-1]
        print(f"\n   Processing: {doc_name}")

        # Poll until extraction completes, fails, or times out
        status, result = _wait_for_extraction(client, session['id'])

        if status == 'failed':
            print("   ❌ Extraction failed")
            extracted_data.append({"document": doc_name, "status": "error"})
            continue

        if status == 'timeout':
            print(f"   ⚠️ Timed out after {POLL_TIMEOUT_SECONDS}s")
            extracted_data.append({"document": doc_name, "status": "timeout"})
            continue

        # Get full result data
        full_result = client.get_result(result['id'])
        print("   ✅ Success!")

        # Extract key information for summary. `or {}` also covers a group
        # that is present but null, which `.get(key, {})` would not.
        data = full_result['data']
        header = data.get('invoice_header') or {}
        vendor = data.get('vendor_info') or {}

        invoice_summary = {
            "document": doc_name,
            "invoice_number": header.get('invoice_number'),
            "vendor": vendor.get('vendor_name'),
            "total": header.get('total_amount'),
            "currency": header.get('currency'),
            "due_date": header.get('due_date'),
            "full_data": data
        }

        extracted_data.append(invoice_summary)

        print(f"      Invoice #: {invoice_summary['invoice_number']}")
        print(f"      Vendor: {invoice_summary['vendor']}")
        print(f"      Total: {invoice_summary['currency']} {invoice_summary['total']}")

    # ═══════════════════════════════════════════════════════════
    # STEP 6: Summary
    # ═══════════════════════════════════════════════════════════
    print("\n" + "═" * 50)
    print("📊 EXTRACTION SUMMARY")
    print("═" * 50)

    successful = [d for d in extracted_data if 'full_data' in d]
    print(f"\nProcessed: {len(successful)}/{len(documents)} documents")

    if successful:
        # NOTE: totals are summed as-is and shown with "$"; invoices in
        # different currencies are not converted.
        total_value = sum(d['total'] or 0 for d in successful)
        print(f"Total invoice value: ${total_value:,.2f}")

        print("\nInvoices extracted:")
        for inv in successful:
            print(f"  • {inv['invoice_number']} - {inv['vendor']} - {_format_amount(inv['total'])}")

    return extracted_data


# Run the pipeline only when this file is executed as a script (not when it
# is imported); the list returned by main() is bound to `results`.
if __name__ == "__main__":
    results = main()

Schema Design Tips

Group Organization

Organize fields into logical groups based on where they appear in documents:
| Group | Purpose | Search Query Tips |
| --- | --- | --- |
| invoice_header | Core invoice data | "invoice, number, date, total, amount" |
| vendor_info | Seller details | "vendor, supplier, from, seller, company" |
| customer_info | Buyer details | "bill to, customer, client, ship to" |
| line_items | Products/services | "items, description, quantity, price" |
| payment_info | Payment details | "payment, bank, terms, wire, account" |

Field Types

{
  "string_field": { "type": "string" },
  "number_field": { "type": "number" },
  "boolean_field": { "type": "boolean" },
  "array_field": {
    "type": "array",
    "items": { "type": "string" }
  },
  "object_field": {
    "type": "object",
    "properties": {
      "nested": { "type": "string" }
    }
  }
}

Extraction Prompts

Write clear, specific prompts:
{
  "extraction_prompt": "Extract the invoice date in YYYY-MM-DD format"
}

{
  "extraction_prompt": "Extract the total amount as a number without currency symbols"
}

{
  "extraction_prompt": "Extract all line items with description, quantity, unit price, and line total"
}

Handling Results

Accessing Nested Data

# `result` is presumably a full result record such as the one returned by
# client.get_result() — it is not defined in this snippet.
#
# Access grouped data: each template group is a key under result['data'].
# Indexing with [...] raises KeyError if the group is missing; .get() on a
# field returns None when the field is absent.
header = result['data']['invoice_header']
invoice_num = header.get('invoice_number')
total = header.get('total_amount')

# Access arrays: the 'items' field of the 'line_items' group is a list of
# objects with the properties declared in the template.
items = result['data']['line_items']['items']
for item in items:
    print(f"{item['description']}: ${item['total']}")

# Access reasoning (if enabled via "reasoning_enabled" in the template config).
# NOTE(review): the group -> field -> 'confidence' layout is assumed from this
# lookup chain — confirm against the API reference. The chained .get() calls
# yield None instead of raising when any level is missing.
reasoning = result.get('reasoning', {})
confidence = reasoning.get('invoice_header', {}).get('invoice_number', {}).get('confidence')

Exporting Results

import json
import csv

# `extracted_data` is the list of per-document records returned by main().

# Export to JSON
# An explicit encoding keeps the output identical across platforms instead of
# depending on the locale's default encoding.
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(extracted_data, f, indent=2)

# Export to CSV (flattened)
csv_fields = ['document', 'invoice_number', 'vendor', 'total', 'currency']

# newline='' lets the csv module control line endings; UTF-8 avoids a
# UnicodeEncodeError on non-ASCII vendor names under a non-UTF-8 locale.
with open('results.csv', 'w', newline='', encoding='utf-8') as f:
    # extrasaction='ignore' drops the keys that are not CSV columns
    # (e.g. 'due_date', 'full_data') instead of raising ValueError.
    writer = csv.DictWriter(f, fieldnames=csv_fields, extrasaction='ignore')
    writer.writeheader()
    # Only successful extractions carry 'full_data'; skip the rest.
    writer.writerows(inv for inv in extracted_data if inv.get('full_data'))

Next Steps