What You’ll Build
A complete invoice extraction pipeline that:
- Creates a dedicated workspace
- Defines a template to extract invoice data
- Processes multiple invoices with auto-extraction
- Retrieves structured results
Prerequisites
- A Raydocs API token with full permissions (`workspaces-write`, `templates-write`, `sessions-write`)
- Invoice documents to process (PDF format)
Complete Implementation
- Python
- TypeScript
- PHP
Copy
import time
from raydocs_client import RaydocsClient
# Seconds to wait between status checks while an extraction is running.
POLL_INTERVAL_SECONDS = 5
# Upper bound on how long to wait for a single document before giving up.
POLL_TIMEOUT_SECONDS = 600


def _is_number(value):
    """Return True when ``value`` is an int or float (booleans excluded)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _format_money(amount):
    """Return ``amount`` as ``$1,234.50``, or ``"n/a"`` when it is not numeric.

    The template's system message asks the extractor to return null for
    fields it cannot find, so a total may be ``None``; applying ``:,.2f``
    to ``None`` raises ``TypeError``.
    """
    if _is_number(amount):
        return f"${amount:,.2f}"
    return "n/a"


def main():
    """Run the invoice extraction pipeline end to end.

    Steps: pick or create a workspace, create the extraction template,
    upload the sample documents, create auto-extracting sessions, poll each
    session until it completes, fails, or times out, then print a summary.

    Returns:
        A list with one dict per processed document. Successful entries carry
        ``document``, ``invoice_number``, ``vendor``, ``total``, ``currency``,
        ``due_date`` and ``full_data``; unsuccessful entries carry only
        ``document`` and ``status`` (``"error"`` or ``"timeout"``).
    """
    # Initialize client
    client = RaydocsClient("your_api_token")

    # ═══════════════════════════════════════════════════════════
    # STEP 1: Create a Workspace
    # ═══════════════════════════════════════════════════════════
    print("📁 Setting up workspace...")

    # Check for existing workspaces
    workspaces = client.list_workspaces()
    if workspaces:
        # Use existing workspace
        workspace = workspaces[0]
        print(f" Using existing workspace: {workspace['name']}")
    else:
        # Create new workspace
        workspace = client.create_workspace(
            name="Invoice Processing",
            icon="📄"
        )
        print(f" Created workspace: {workspace['name']}")
    workspace_id = workspace['id']

    # ═══════════════════════════════════════════════════════════
    # STEP 2: Create an Extraction Template
    # ═══════════════════════════════════════════════════════════
    print("\n📋 Creating extraction template...")
    invoice_schema = {
        "config": {
            "reasoning_enabled": True,
            "system_message": "You are extracting data from invoices. Be precise with numbers and dates. Return null for any field not found in the document."
        },
        "groups": {
            # Group 1: Basic invoice information
            "invoice_header": {
                "search_query": "invoice number, invoice date, due date, total amount, subtotal, tax",
                "fields": {
                    "invoice_number": {
                        "type": "string",
                        "extraction_prompt": "Extract the invoice number or invoice ID"
                    },
                    "invoice_date": {
                        "type": "string",
                        "extraction_prompt": "Extract the invoice date in YYYY-MM-DD format"
                    },
                    "due_date": {
                        "type": "string",
                        "extraction_prompt": "Extract the payment due date in YYYY-MM-DD format"
                    },
                    "subtotal": {
                        "type": "number",
                        "extraction_prompt": "Extract the subtotal amount before tax"
                    },
                    "tax_amount": {
                        "type": "number",
                        "extraction_prompt": "Extract the tax amount"
                    },
                    "total_amount": {
                        "type": "number",
                        "extraction_prompt": "Extract the total amount due"
                    },
                    "currency": {
                        "type": "string",
                        "extraction_prompt": "Extract the currency code (USD, EUR, GBP, etc.)"
                    }
                }
            },
            # Group 2: Vendor/Supplier information
            "vendor_info": {
                "search_query": "vendor, supplier, seller, company name, business name, from, bill from",
                "fields": {
                    "vendor_name": {
                        "type": "string",
                        "extraction_prompt": "Extract the vendor or supplier company name"
                    },
                    "vendor_address": {
                        "type": "string",
                        "extraction_prompt": "Extract the vendor's full address"
                    },
                    "vendor_email": {
                        "type": "string",
                        "extraction_prompt": "Extract the vendor's email address"
                    },
                    "vendor_phone": {
                        "type": "string",
                        "extraction_prompt": "Extract the vendor's phone number"
                    }
                }
            },
            # Group 3: Customer/Billing information
            "customer_info": {
                "search_query": "bill to, customer, client, buyer, ship to",
                "fields": {
                    "customer_name": {
                        "type": "string",
                        "extraction_prompt": "Extract the customer or client name"
                    },
                    "customer_address": {
                        "type": "string",
                        "extraction_prompt": "Extract the customer's billing address"
                    }
                }
            },
            # Group 4: Line items
            "line_items": {
                "search_query": "items, products, services, description, quantity, price, amount",
                "fields": {
                    "items": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "description": {"type": "string"},
                                "quantity": {"type": "number"},
                                "unit_price": {"type": "number"},
                                "total": {"type": "number"}
                            }
                        },
                        "extraction_prompt": "Extract all line items with description, quantity, unit price, and line total"
                    }
                }
            },
            # Group 5: Payment information
            "payment_info": {
                "search_query": "payment terms, bank account, wire transfer, payment method",
                "fields": {
                    "payment_terms": {
                        "type": "string",
                        "extraction_prompt": "Extract payment terms (e.g., Net 30, Due on receipt)"
                    },
                    "bank_name": {
                        "type": "string",
                        "extraction_prompt": "Extract the bank name for wire transfers"
                    },
                    "account_number": {
                        "type": "string",
                        "extraction_prompt": "Extract the bank account number (mask middle digits for security)"
                    }
                }
            }
        }
    }
    template = client.create_template(
        workspace_id=workspace_id,
        name="Invoice Extractor v1",
        description="Comprehensive invoice data extraction template",
        schema=invoice_schema
    )
    template_id = template['id']
    print(f" Created template: {template['name']}")
    print(f" Template ID: {template_id}")

    # ═══════════════════════════════════════════════════════════
    # STEP 3: Upload Documents
    # ═══════════════════════════════════════════════════════════
    print("\n📤 Uploading documents...")
    documents = [
        "invoices/acme-corp-001.pdf",
        "invoices/globex-002.pdf",
        "invoices/initech-003.pdf"
    ]
    file_keys = []
    for doc in documents:
        print(f" Uploading: {doc}")
        key = client.upload_file(doc)
        file_keys.append(key)
    print(f" ✓ Uploaded {len(file_keys)} documents")

    # ═══════════════════════════════════════════════════════════
    # STEP 4: Create Sessions with Auto-Extract
    # ═══════════════════════════════════════════════════════════
    print("\n🚀 Starting extraction...")
    sessions = client.batch_create_sessions(
        template_id=template_id,
        file_keys=file_keys,
        auto_extract=True
    )
    print(f" Created {len(sessions)} sessions with auto-extraction enabled")

    # ═══════════════════════════════════════════════════════════
    # STEP 5: Poll for Results
    # ═══════════════════════════════════════════════════════════
    print("\n⏳ Processing documents...")
    extracted_data = []
    # NOTE(review): sessions are paired with documents by position; this
    # assumes batch_create_sessions returns sessions in file_keys order —
    # confirm against the API.
    for doc_path, session in zip(documents, sessions):
        doc_name = doc_path.split('/')[-1]
        print(f"\n Processing: {doc_name}")

        # Poll until extraction completes, fails, or the deadline passes.
        deadline = time.monotonic() + POLL_TIMEOUT_SECONDS
        while True:
            results = client.get_results(session['id'])
            result = results[0] if results else None
            status = result['status'] if result else None

            if status == 'completed':
                # Get full result data
                full_result = client.get_result(result['id'])
                print(" ✅ Success!")

                # Extract key information for summary. ``or {}`` also covers
                # a group that is present but null.
                data = full_result['data']
                header = data.get('invoice_header') or {}
                vendor = data.get('vendor_info') or {}
                invoice_summary = {
                    "document": doc_name,
                    "invoice_number": header.get('invoice_number'),
                    "vendor": vendor.get('vendor_name'),
                    "total": header.get('total_amount'),
                    "currency": header.get('currency'),
                    "due_date": header.get('due_date'),
                    "full_data": data
                }
                extracted_data.append(invoice_summary)
                print(f" Invoice #: {invoice_summary['invoice_number']}")
                print(f" Vendor: {invoice_summary['vendor']}")
                print(f" Total: {invoice_summary['currency']} {invoice_summary['total']}")
                break

            if status == 'failed':
                print(" ❌ Extraction failed")
                extracted_data.append({"document": doc_name, "status": "error"})
                break

            if time.monotonic() >= deadline:
                # Without a deadline, a session stuck in any other status
                # would keep this loop running forever.
                print(f" ⏱️ Timed out after {POLL_TIMEOUT_SECONDS} seconds")
                extracted_data.append({"document": doc_name, "status": "timeout"})
                break

            time.sleep(POLL_INTERVAL_SECONDS)

    # ═══════════════════════════════════════════════════════════
    # STEP 6: Summary
    # ═══════════════════════════════════════════════════════════
    print("\n" + "═" * 50)
    print("📊 EXTRACTION SUMMARY")
    print("═" * 50)
    successful = [d for d in extracted_data if 'full_data' in d]
    print(f"\nProcessed: {len(successful)}/{len(documents)} documents")
    if successful:
        # NOTE: totals are added up regardless of each invoice's currency.
        total_value = sum(
            d['total'] for d in successful if _is_number(d['total'])
        )
        print(f"Total invoice value: ${total_value:,.2f}")
        print("\nInvoices extracted:")
        for inv in successful:
            print(f" • {inv['invoice_number']} - {inv['vendor']} - {_format_money(inv['total'])}")
    return extracted_data


if __name__ == "__main__":
    results = main()
Copy
import { RaydocsClient } from './raydocs-client';
/**
 * One entry in the pipeline's output: a per-document summary.
 *
 * Only `document` is always present; every other property is optional.
 */
interface InvoiceSummary {
  /** File name of the processed document. */
  document: string;

  // Values copied out of the extraction result
  invoiceNumber?: string;
  vendor?: string;
  total?: number;
  currency?: string;
  dueDate?: string;

  /** The complete extraction payload for the document. */
  fullData?: any;

  // Outcome markers for documents that did not produce data
  status?: string;
  error?: string;
}
/**
 * Runs the invoice extraction pipeline end to end: picks or creates a
 * workspace, creates the extraction template, uploads the sample documents,
 * creates auto-extracting sessions, polls each session until it completes,
 * fails, or times out, and prints a summary.
 *
 * @returns One summary per processed document. Successful entries include
 *   `fullData`; unsuccessful ones carry only `document` and `status`
 *   (`'error'` or `'timeout'`).
 */
async function main(): Promise<InvoiceSummary[]> {
  // Delay between status checks, and the cap on waiting for one document.
  const pollIntervalMs = 5000;
  const pollTimeoutMs = 10 * 60 * 1000;

  const client = new RaydocsClient('your_api_token');

  // ═══════════════════════════════════════════════════════════
  // STEP 1: Create a Workspace
  // ═══════════════════════════════════════════════════════════
  console.log('📁 Setting up workspace...');
  const workspaces = await client.listWorkspaces();
  let workspace;
  if (workspaces.length > 0) {
    workspace = workspaces[0];
    console.log(` Using existing workspace: ${workspace.name}`);
  } else {
    workspace = await client.createWorkspace('Invoice Processing', '📄');
    console.log(` Created workspace: ${workspace.name}`);
  }

  // ═══════════════════════════════════════════════════════════
  // STEP 2: Create an Extraction Template
  // ═══════════════════════════════════════════════════════════
  console.log('\n📋 Creating extraction template...');
  const invoiceSchema = {
    config: {
      reasoning_enabled: true,
      system_message: 'Extract invoice data precisely. Return null for missing fields.',
    },
    groups: {
      invoice_header: {
        search_query: 'invoice number, date, due date, total amount, tax',
        fields: {
          invoice_number: {
            type: 'string',
            extraction_prompt: 'Extract the invoice number',
          },
          invoice_date: {
            type: 'string',
            extraction_prompt: 'Extract date in YYYY-MM-DD format',
          },
          // The summary below reads header.due_date, so the template has to
          // request it; otherwise dueDate is always undefined.
          due_date: {
            type: 'string',
            extraction_prompt: 'Extract the payment due date in YYYY-MM-DD format',
          },
          total_amount: {
            type: 'number',
            extraction_prompt: 'Extract the total amount',
          },
          currency: {
            type: 'string',
            extraction_prompt: 'Extract currency code (USD, EUR, etc.)',
          },
        },
      },
      vendor_info: {
        search_query: 'vendor, supplier, company name, from',
        fields: {
          vendor_name: {
            type: 'string',
            extraction_prompt: 'Extract the vendor company name',
          },
          vendor_address: {
            type: 'string',
            extraction_prompt: 'Extract the vendor address',
          },
        },
      },
      line_items: {
        search_query: 'items, products, description, quantity, price',
        fields: {
          items: {
            type: 'array',
            items: {
              type: 'object',
              properties: {
                description: { type: 'string' },
                quantity: { type: 'number' },
                unit_price: { type: 'number' },
                total: { type: 'number' },
              },
            },
            extraction_prompt: 'Extract all line items',
          },
        },
      },
    },
  };
  const template = await client.createTemplate(
    workspace.id,
    'Invoice Extractor v1',
    invoiceSchema,
    'Comprehensive invoice extraction'
  );
  console.log(` Created template: ${template.name}`);
  console.log(` Template ID: ${template.id}`);

  // ═══════════════════════════════════════════════════════════
  // STEP 3: Upload Documents
  // ═══════════════════════════════════════════════════════════
  console.log('\n📤 Uploading documents...');
  const documents = [
    'invoices/acme-corp-001.pdf',
    'invoices/globex-002.pdf',
    'invoices/initech-003.pdf',
  ];
  const fileKeys: string[] = [];
  for (const doc of documents) {
    console.log(` Uploading: ${doc}`);
    const key = await client.uploadFile(doc);
    fileKeys.push(key);
  }
  console.log(` ✓ Uploaded ${fileKeys.length} documents`);

  // ═══════════════════════════════════════════════════════════
  // STEP 4: Create Sessions with Auto-Extract
  // ═══════════════════════════════════════════════════════════
  console.log('\n🚀 Starting extraction...');
  const sessions = await client.batchCreateSessions(template.id, fileKeys, true);
  console.log(` Created ${sessions.length} sessions`);

  // ═══════════════════════════════════════════════════════════
  // STEP 5: Poll for Results
  // ═══════════════════════════════════════════════════════════
  console.log('\n⏳ Processing documents...');
  const extractedData: InvoiceSummary[] = [];
  // NOTE(review): sessions are matched to documents by index; this assumes
  // batchCreateSessions returns sessions in fileKeys order — confirm.
  for (let i = 0; i < sessions.length; i++) {
    const docName = documents[i].split('/').pop() || documents[i];
    console.log(`\n Processing: ${docName}`);

    // Poll until extraction completes, fails, or the deadline passes.
    const deadline = Date.now() + pollTimeoutMs;
    while (true) {
      const results = await client.getResults(sessions[i].id);
      if (results.length > 0) {
        const result = results[0];
        if (result.status === 'completed') {
          const fullResult = await client.getResult(result.id);
          console.log(' ✅ Success!');
          const header = fullResult.data.invoice_header || {};
          const vendor = fullResult.data.vendor_info || {};
          const summary: InvoiceSummary = {
            document: docName,
            invoiceNumber: header.invoice_number,
            vendor: vendor.vendor_name,
            total: header.total_amount,
            currency: header.currency,
            dueDate: header.due_date,
            fullData: fullResult.data,
          };
          extractedData.push(summary);
          console.log(` Invoice #: ${summary.invoiceNumber}`);
          console.log(` Vendor: ${summary.vendor}`);
          console.log(` Total: ${summary.currency} ${summary.total}`);
          break;
        } else if (result.status === 'failed') {
          console.log(' ❌ Extraction failed');
          extractedData.push({ document: docName, status: 'error' });
          break;
        }
      }

      if (Date.now() >= deadline) {
        // Without a deadline, a session stuck in any other status would
        // keep this loop running forever.
        console.log(` ⏱️ Timed out after ${pollTimeoutMs / 1000} seconds`);
        extractedData.push({ document: docName, status: 'timeout' });
        break;
      }
      await new Promise((r) => setTimeout(r, pollIntervalMs));
    }
  }

  // ═══════════════════════════════════════════════════════════
  // STEP 6: Summary
  // ═══════════════════════════════════════════════════════════
  console.log('\n' + '═'.repeat(50));
  console.log('📊 EXTRACTION SUMMARY');
  console.log('═'.repeat(50));
  const successful = extractedData.filter((d) => d.fullData);
  console.log(`\nProcessed: ${successful.length}/${documents.length} documents`);
  if (successful.length > 0) {
    // NOTE: totals are added up regardless of each invoice's currency.
    const totalValue = successful.reduce((sum, d) => sum + (d.total || 0), 0);
    console.log(`Total invoice value: $${totalValue.toLocaleString()}`);
  }
  return extractedData;
}

main().catch(console.error);
Copy
<?php
require_once 'vendor/autoload.php';
require_once 'RaydocsClient.php';
/**
 * Runs the invoice extraction pipeline end to end: picks or creates a
 * workspace, creates the extraction template, uploads the sample documents,
 * creates auto-extracting sessions, polls each session until it completes,
 * fails, or times out, and prints a summary.
 *
 * @return array One entry per processed document. Successful entries include
 *               'fullData'; unsuccessful ones carry only 'document' and
 *               'status' ('error' or 'timeout').
 */
function main(): array
{
    // Delay between status checks, and the cap on waiting for one document.
    $pollIntervalSeconds = 5;
    $pollTimeoutSeconds = 600;

    // Initialize client
    $client = new RaydocsClient('your_api_token');

    // ═══════════════════════════════════════════════════════════
    // STEP 1: Create a Workspace
    // ═══════════════════════════════════════════════════════════
    echo "📁 Setting up workspace...\n";
    $workspaces = $client->listWorkspaces();
    if (!empty($workspaces)) {
        $workspace = $workspaces[0];
        echo " Using existing workspace: {$workspace['name']}\n";
    } else {
        $workspace = $client->createWorkspace('Invoice Processing', '📄');
        echo " Created workspace: {$workspace['name']}\n";
    }
    $workspaceId = $workspace['id'];

    // ═══════════════════════════════════════════════════════════
    // STEP 2: Create an Extraction Template
    // ═══════════════════════════════════════════════════════════
    echo "\n📋 Creating extraction template...\n";
    $invoiceSchema = [
        'config' => [
            'reasoning_enabled' => true,
            'system_message' => 'Extract invoice data precisely. Return null for missing fields.',
        ],
        'groups' => [
            'invoice_header' => [
                'search_query' => 'invoice number, date, total amount, tax',
                'fields' => [
                    'invoice_number' => [
                        'type' => 'string',
                        'extraction_prompt' => 'Extract the invoice number',
                    ],
                    'invoice_date' => [
                        'type' => 'string',
                        'extraction_prompt' => 'Extract date in YYYY-MM-DD format',
                    ],
                    'total_amount' => [
                        'type' => 'number',
                        'extraction_prompt' => 'Extract the total amount',
                    ],
                    'currency' => [
                        'type' => 'string',
                        'extraction_prompt' => 'Extract currency code (USD, EUR, etc.)',
                    ],
                ],
            ],
            'vendor_info' => [
                'search_query' => 'vendor, supplier, company name, from',
                'fields' => [
                    'vendor_name' => [
                        'type' => 'string',
                        'extraction_prompt' => 'Extract the vendor company name',
                    ],
                    'vendor_address' => [
                        'type' => 'string',
                        'extraction_prompt' => 'Extract the vendor address',
                    ],
                ],
            ],
            'line_items' => [
                'search_query' => 'items, products, description, quantity, price',
                'fields' => [
                    'items' => [
                        'type' => 'array',
                        'items' => [
                            'type' => 'object',
                            'properties' => [
                                'description' => ['type' => 'string'],
                                'quantity' => ['type' => 'number'],
                                'unit_price' => ['type' => 'number'],
                                'total' => ['type' => 'number'],
                            ],
                        ],
                        'extraction_prompt' => 'Extract all line items',
                    ],
                ],
            ],
        ],
    ];
    $template = $client->createTemplate(
        $workspaceId,
        'Invoice Extractor v1',
        $invoiceSchema,
        'Comprehensive invoice extraction'
    );
    echo " Created template: {$template['name']}\n";
    echo " Template ID: {$template['id']}\n";

    // ═══════════════════════════════════════════════════════════
    // STEP 3: Upload Documents
    // ═══════════════════════════════════════════════════════════
    echo "\n📤 Uploading documents...\n";
    $documents = [
        'invoices/acme-corp-001.pdf',
        'invoices/globex-002.pdf',
        'invoices/initech-003.pdf',
    ];
    $fileKeys = [];
    foreach ($documents as $doc) {
        echo " Uploading: {$doc}\n";
        $fileKeys[] = $client->uploadFile($doc);
    }
    echo " ✓ Uploaded " . count($fileKeys) . " documents\n";

    // ═══════════════════════════════════════════════════════════
    // STEP 4: Create Sessions with Auto-Extract
    // ═══════════════════════════════════════════════════════════
    echo "\n🚀 Starting extraction...\n";
    $sessions = $client->batchCreateSessions($template['id'], $fileKeys, true);
    echo " Created " . count($sessions) . " sessions\n";

    // ═══════════════════════════════════════════════════════════
    // STEP 5: Poll for Results
    // ═══════════════════════════════════════════════════════════
    echo "\n⏳ Processing documents...\n";
    $extractedData = [];
    // NOTE(review): sessions are matched to documents by index; this assumes
    // batchCreateSessions returns sessions in $fileKeys order — confirm.
    foreach ($sessions as $i => $session) {
        $docName = basename($documents[$i]);
        echo "\n Processing: {$docName}\n";

        // Poll until extraction completes, fails, or the deadline passes.
        $deadline = time() + $pollTimeoutSeconds;
        while (true) {
            $results = $client->getResults($session['id']);
            if (!empty($results)) {
                $result = $results[0];
                if ($result['status'] === 'completed') {
                    $fullResult = $client->getResult($result['id']);
                    echo " ✅ Success!\n";
                    $header = $fullResult['data']['invoice_header'] ?? [];
                    $vendor = $fullResult['data']['vendor_info'] ?? [];
                    $summary = [
                        'document' => $docName,
                        'invoiceNumber' => $header['invoice_number'] ?? null,
                        'vendor' => $vendor['vendor_name'] ?? null,
                        'total' => $header['total_amount'] ?? null,
                        'currency' => $header['currency'] ?? null,
                        'fullData' => $fullResult['data'],
                    ];
                    $extractedData[] = $summary;
                    echo " Invoice #: {$summary['invoiceNumber']}\n";
                    echo " Vendor: {$summary['vendor']}\n";
                    echo " Total: {$summary['currency']} {$summary['total']}\n";
                    break;
                } elseif ($result['status'] === 'failed') {
                    echo " ❌ Extraction failed\n";
                    $extractedData[] = ['document' => $docName, 'status' => 'error'];
                    break;
                }
            }

            if (time() >= $deadline) {
                // Without a deadline, a session stuck in any other status
                // would keep this loop running forever.
                echo " ⏱️ Timed out after {$pollTimeoutSeconds} seconds\n";
                $extractedData[] = ['document' => $docName, 'status' => 'timeout'];
                break;
            }
            sleep($pollIntervalSeconds);
        }
    }

    // ═══════════════════════════════════════════════════════════
    // STEP 6: Summary
    // ═══════════════════════════════════════════════════════════
    echo "\n" . str_repeat('═', 50) . "\n";
    echo "📊 EXTRACTION SUMMARY\n";
    echo str_repeat('═', 50) . "\n";
    $successful = array_filter($extractedData, fn($d) => isset($d['fullData']));
    echo "\nProcessed: " . count($successful) . "/" . count($documents) . " documents\n";
    if (!empty($successful)) {
        // NOTE: totals are added up regardless of each invoice's currency.
        $totalValue = array_sum(array_map(fn($d) => $d['total'] ?? 0, $successful));
        echo "Total invoice value: $" . number_format($totalValue, 2) . "\n";
    }
    return $extractedData;
}

$results = main();
Schema Design Tips
Group Organization
Organize fields into logical groups based on where they appear in documents:

| Group | Purpose | Search Query Tips |
|---|---|---|
| `invoice_header` | Core invoice data | "invoice, number, date, total, amount" |
| `vendor_info` | Seller details | "vendor, supplier, from, seller, company" |
| `customer_info` | Buyer details | "bill to, customer, client, ship to" |
| `line_items` | Products/services | "items, description, quantity, price" |
| `payment_info` | Payment details | "payment, bank, terms, wire, account" |
Field Types
Copy
{
"string_field": { "type": "string" },
"number_field": { "type": "number" },
"boolean_field": { "type": "boolean" },
"array_field": {
"type": "array",
"items": { "type": "string" }
},
"object_field": {
"type": "object",
"properties": {
"nested": { "type": "string" }
}
}
}
Extraction Prompts
Write clear, specific prompts:
- Good Prompts
- Bad Prompts
Copy
{
"extraction_prompt": "Extract the invoice date in YYYY-MM-DD format"
}
{
"extraction_prompt": "Extract the total amount as a number without currency symbols"
}
{
"extraction_prompt": "Extract all line items with description, quantity, unit price, and line total"
}
Copy
// Too vague
{ "extraction_prompt": "Get the date" }
// No format specified
{ "extraction_prompt": "Find the amount" }
// Missing details
{ "extraction_prompt": "Extract items" }
Handling Results
Accessing Nested Data
Copy
# Access grouped data. `or {}` covers a group that is missing or null
# (the template asks the extractor to return null for anything not found).
data = result['data']
header = data.get('invoice_header') or {}
invoice_num = header.get('invoice_number')
total = header.get('total_amount')

# Access arrays; fall back to an empty list when no line items were extracted
items = (data.get('line_items') or {}).get('items') or []
for item in items:
    print(f"{item.get('description')}: ${item.get('total')}")

# Access reasoning (if enabled)
reasoning = result.get('reasoning') or {}
confidence = reasoning.get('invoice_header', {}).get('invoice_number', {}).get('confidence')
Exporting Results
Copy
import json
import csv
# Export to JSON. The encoding is set explicitly so the output does not
# depend on the platform's default locale encoding.
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(extracted_data, f, indent=2)

# Export to CSV (flattened). Only successful extractions carry 'full_data';
# failed documents are skipped.
csv_columns = ['document', 'invoice_number', 'vendor', 'total', 'currency']
with open('results.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    for inv in extracted_data:
        if inv.get('full_data'):
            writer.writerow({column: inv[column] for column in csv_columns})
