Embeddings
Optimization
Optimization
Batch Processing for Efficiency
When processing many documents, itβs more efficient to batch them:
<?php
use Cognesy\Polyglot\Embeddings\Embeddings;
$embeddings = new Embeddings();
$allDocuments = [/* large array of documents */];
// Process in batches of 25 (check provider-specific limits)
$batchSize = 25;
$vectors = [];
for ($i = 0; $i < count($allDocuments); $i += $batchSize) {
$batch = array_slice($allDocuments, $i, $batchSize);
try {
$response = $embeddings->create($batch);
$batchVectors = $response->toValuesArray();
// Add to our vectors array
$vectors = array_merge($vectors, $batchVectors);
echo "Processed batch " . (floor($i / $batchSize) + 1) . " of " . ceil(count($allDocuments) / $batchSize) . "\n";
} catch (\Exception $e) {
echo "Error processing batch: " . $e->getMessage() . "\n";
}
// Optional: Add a small delay to avoid hitting rate limits
usleep(100000); // 100ms
}
echo "Processed " . count($vectors) . " embeddings in total.\n";
Caching Embeddings
For better performance, you can cache embeddings to avoid regenerating them:
<?php
use Cognesy\Polyglot\Embeddings\Embeddings;
class CachedEmbeddings {
private $embeddings;
private $cache = [];
public function __construct(Embeddings $embeddings = null) {
$this->embeddings = $embeddings ?? new Embeddings();
}
public function create($input, array $options = []): array {
if (is_string($input)) {
// Single string input
$cacheKey = $this->getCacheKey($input, $options);
if (isset($this->cache[$cacheKey])) {
return $this->cache[$cacheKey];
}
$response = $this->embeddings->create($input, $options);
$vector = $response->first()->values();
$this->cache[$cacheKey] = $vector;
return $vector;
} else {
// Array of strings
$results = [];
$uncachedInputs = [];
$uncachedIndices = [];
// Check cache for each input
foreach ($input as $i => $text) {
$cacheKey = $this->getCacheKey($text, $options);
if (isset($this->cache[$cacheKey])) {
$results[$i] = $this->cache[$cacheKey];
} else {
$uncachedInputs[] = $text;
$uncachedIndices[] = $i;
}
}
// Generate embeddings for uncached inputs
if (!empty($uncachedInputs)) {
$response = $this->embeddings->create($uncachedInputs, $options);
$vectors = $response->toValuesArray();
foreach ($vectors as $j => $vector) {
$i = $uncachedIndices[$j];
$results[$i] = $vector;
// Update cache
$cacheKey = $this->getCacheKey($input[$i], $options);
$this->cache[$cacheKey] = $vector;
}
}
// Sort by original indices
ksort($results);
return $results;
}
}
private function getCacheKey(string $input, array $options): string {
$model = $options['model'] ?? '';
return md5($input . serialize($options) . $model);
}
}
// Usage
$cachedEmbeddings = new CachedEmbeddings(new Embeddings('openai'));
// First call will generate embeddings
$vector1 = $cachedEmbeddings->create("This is a test");
echo "First call completed, generated vector with " . count($vector1) . " dimensions.\n";
// Second call will use the cache
$vector2 = $cachedEmbeddings->create("This is a test");
echo "Second call completed (from cache).\n";
// Compare vectors to verify they're the same
$equal = (serialize($vector1) === serialize($vector2));
echo "Vectors are " . ($equal ? "identical" : "different") . ".\n";