mirror of
https://github.com/lone-cloud/gerbil
synced 2026-06-04 12:13:28 -07:00
122 lines
3.1 KiB
TypeScript
122 lines
3.1 KiB
TypeScript
import { gguf } from '@huggingface/gguf';
|
|
import { stat } from 'fs/promises';
|
|
|
|
/** Inputs for estimating how many model layers can be offloaded to the GPU. */
interface VramCalculationParams {
  /** Local file path, or an `http://` / `https://` URL, of the model file. */
  modelPath: string;

  /** Context length used to size the KV cache (presumably in tokens). */
  contextSize: number;

  /** VRAM budget in GB; compared against sizes computed as bytes / 1024 ** 3. */
  availableVramGB: number;

  /** When true, the KV cache estimate is halved. Treated as `false` if omitted. */
  flashAttention?: boolean;
}
|
|
|
|
/**
 * Estimates the KV cache footprint, in GB (bytes / 1024 ** 3), for a given
 * number of layers.
 *
 * The byte count is `2 * contextSize * layers * embeddingLength * 2`, where
 * the trailing 2 is the bytes-per-element constant. The leading factor of 2
 * presumably accounts for separate key and value tensors.
 *
 * @param contextSize - Context length the cache must hold.
 * @param layers - Number of layers whose cache is counted.
 * @param embeddingLength - Per-position width of the cached vectors.
 * @param flashAttention - When true, the estimate is halved.
 * @returns The estimated cache size in GB.
 */
function estimateContextVram(
  contextSize: number,
  layers: number,
  embeddingLength: number,
  flashAttention: boolean
) {
  const BYTES_PER_ELEMENT = 2;
  const BYTES_PER_GB = 1024 ** 3;

  const fullCacheBytes =
    2 * contextSize * layers * embeddingLength * BYTES_PER_ELEMENT;
  const effectiveBytes = flashAttention ? fullCacheBytes * 0.5 : fullCacheBytes;

  return effectiveBytes / BYTES_PER_GB;
}
|
|
|
|
/**
 * Parses a `Content-Length` header value.
 * Returns 0 when the header is absent, malformed, or not positive.
 */
function parseContentLength(headerValue: string | null): number {
  if (!headerValue) {
    return 0;
  }
  const parsed = Number.parseInt(headerValue, 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : 0;
}

/**
 * Resolves the model size in bytes: via an HTTP HEAD request for URLs, via
 * `stat` for local files. An unknown remote size is reported as 0.
 */
async function resolveModelSizeBytes(
  modelPath: string,
  isUrl: boolean
): Promise<number> {
  let sizeBytes: number;
  if (isUrl) {
    const response = await fetch(modelPath, { method: 'HEAD' });
    // A non-2xx response carries the length of an error body (if any), not
    // of the model, so it is treated the same as a missing header.
    sizeBytes = response.ok
      ? parseContentLength(response.headers.get('content-length'))
      : 0;
  } else {
    const stats = await stat(modelPath);
    sizeBytes = stats.size;
  }

  // Split models are named "...-00001-of-00003.<ext>". Only one part was
  // measured, so the total is approximated as that part's size times the
  // part count.
  const multiPartMatch = /-(\d{5})-of-(\d{5})\./.exec(modelPath);
  if (multiPartMatch) {
    const totalParts = Number.parseInt(multiPartMatch[2], 10);
    if (totalParts > 1 && totalParts <= 999) {
      sizeBytes *= totalParts;
    }
  }

  return sizeBytes;
}

/**
 * Reads a positive, finite numeric metadata value (number or bigint).
 * Returns `fallback` for anything else.
 */
function readPositiveNumber(
  record: Record<string, unknown>,
  key: string,
  fallback: number
): number {
  const raw = record[key];
  const value = typeof raw === 'bigint' ? Number(raw) : raw;
  return typeof value === 'number' && Number.isFinite(value) && value > 0
    ? value
    : fallback;
}

/**
 * Estimates how many model layers fit into the available VRAM.
 *
 * The model file size is divided evenly across the layer count read from the
 * GGUF metadata. Layers are then added one at a time, together with the KV
 * cache estimate for that many layers, until the total would exceed
 * `availableVramGB` minus a fixed 0.1 GB headroom.
 *
 * Missing or non-numeric metadata falls back to: architecture `llama`,
 * 32 layers, embedding length 4096, 32 attention heads, and a KV head count
 * equal to the attention head count.
 *
 * If the size of a remote model cannot be determined it is treated as 0, in
 * which case only the KV cache contributes to the estimate.
 *
 * Rejections from `stat`, `fetch`, or `gguf` propagate to the caller unchanged.
 *
 * @returns The recommended layer count, the total layer count, and the VRAM
 *   breakdown (all sizes in GB, computed as bytes / 1024 ** 3).
 */
export async function calculateOptimalGpuLayers({
  modelPath,
  contextSize,
  availableVramGB,
  flashAttention = false,
}: VramCalculationParams): Promise<{
  recommendedLayers: number;
  totalLayers: number;
  estimatedVramUsageGB: number;
  modelVramGB: number;
  contextVramGB: number;
  headroomGB: number;
}> {
  const isUrl =
    modelPath.startsWith('http://') || modelPath.startsWith('https://');

  const fileSize = await resolveModelSizeBytes(modelPath, isUrl);

  const { metadata } = await gguf(modelPath, {
    allowLocalFile: !isUrl,
  });
  const metadataRecord = metadata as Record<string, unknown>;

  const rawArchitecture = metadataRecord['general.architecture'];
  const architecture =
    typeof rawArchitecture === 'string' && rawArchitecture !== ''
      ? rawArchitecture
      : 'llama';

  const totalLayers = readPositiveNumber(
    metadataRecord,
    `${architecture}.block_count`,
    32
  );
  const embeddingLength = readPositiveNumber(
    metadataRecord,
    `${architecture}.embedding_length`,
    4096
  );
  const headCount = readPositiveNumber(
    metadataRecord,
    `${architecture}.attention.head_count`,
    32
  );
  const headCountKv = readPositiveNumber(
    metadataRecord,
    `${architecture}.attention.head_count_kv`,
    headCount
  );

  // Width of the cached K/V vectors: KV heads times the per-head dimension.
  const headDim = embeddingLength / headCount;
  const kvDim = headCountKv * headDim;

  const modelSizeGB = fileSize / 1024 ** 3;
  const vramPerLayerGB = modelSizeGB / totalLayers;

  const headroomGB = 0.1;
  const availableForModel = availableVramGB - headroomGB;

  let recommendedLayers = 0;
  for (let layers = 1; layers <= totalLayers; layers++) {
    const requiredGB =
      layers * vramPerLayerGB +
      estimateContextVram(contextSize, layers, kvDim, flashAttention);

    // Written as `<=` so that a NaN total (e.g. NaN inputs) stops the search
    // instead of being accepted.
    if (requiredGB <= availableForModel) {
      recommendedLayers = layers;
    } else {
      break;
    }
  }

  const modelVramGB = recommendedLayers * vramPerLayerGB;
  const contextVramGB = estimateContextVram(
    contextSize,
    recommendedLayers,
    kvDim,
    flashAttention
  );

  return {
    recommendedLayers,
    totalLayers,
    estimatedVramUsageGB: modelVramGB + contextVramGB,
    modelVramGB,
    contextVramGB,
    headroomGB,
  };
}
|