import OpenAI from "openai"

// Point the OpenAI SDK at the AIAPILAB endpoint and read the API key from the environment
const openai = new OpenAI({
  baseURL: "https://api.aiapilab.com/v1",
  apiKey: process.env.AIAPILAB_API_KEY
})

async function main() {
  // Send a multimodal request: a text question plus an image URL in one user message
  const completion = await openai.chat.completions.create({
    model: "meta-llama/llama-3.2-11b-vision-instruct",
    messages: [
      {
        role: "user",
        content: [
          {
            type: "text",
            text: "What's in this image?"
          },
          {
            type: "image_url",
            image_url: {
              url: "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
            }
          }
        ]
      }
    ]
  })

  // The model's description of the image
  console.log(completion.choices[0].message)
}

main()
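The example above passes the image as a public URL. If you need to send a local file instead, the OpenAI-compatible chat format generally accepts a base64 data URL in the same `image_url.url` field. The sketch below assumes the AIAPILAB endpoint accepts data URLs (not confirmed here) and uses a placeholder file name `boardwalk.jpg`; the `describeLocalImage` helper is illustrative, not part of the SDK.

```javascript
import fs from "node:fs"
import OpenAI from "openai"

const openai = new OpenAI({
  baseURL: "https://api.aiapilab.com/v1",
  apiKey: process.env.AIAPILAB_API_KEY
})

async function describeLocalImage(path) {
  // Encode the local file as a base64 data URL
  // (assumes a JPEG; adjust the MIME type for PNG, WebP, etc.)
  const base64 = fs.readFileSync(path).toString("base64")
  const dataUrl = `data:image/jpeg;base64,${base64}`

  const completion = await openai.chat.completions.create({
    model: "meta-llama/llama-3.2-11b-vision-instruct",
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: "What's in this image?" },
          { type: "image_url", image_url: { url: dataUrl } }
        ]
      }
    ]
  })

  return completion.choices[0].message
}

// "boardwalk.jpg" is a placeholder path for illustration
describeLocalImage("boardwalk.jpg").then(console.log)
```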
| Feature/Aspect | Model A (e.g., GPT-4) | Model B (e.g., Google Gemini) | Meta Llama 3.2 11B Vision Instruct |
|---|---|---|---|
| Parameters | 175 billion | 70 billion | 11 billion |
| Training Data | Trained on diverse datasets, including web text | Trained on large-scale multilingual datasets | Pretrained on 6 billion image-text pairs |
| Context Length | Up to 32k tokens | Up to 64k tokens | Up to 128k tokens |
| Input Modality | Text only | Text + Image | Text + Image |
| Primary Use Cases | General Text Generation, Conversational AI | Multimodal Tasks, Visual Reasoning | Visual Question Answering, Image Captioning |
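To make the use cases in the last row concrete: with this chat format, visual question answering and image captioning differ only in the text prompt sent alongside the image. A minimal sketch follows; the `askImage` helper and the example prompts are illustrative choices, not part of any official API.

```javascript
import OpenAI from "openai"

const openai = new OpenAI({
  baseURL: "https://api.aiapilab.com/v1",
  apiKey: process.env.AIAPILAB_API_KEY
})

// Hypothetical helper: the image stays the same, only the question changes
async function askImage(imageUrl, question) {
  const completion = await openai.chat.completions.create({
    model: "meta-llama/llama-3.2-11b-vision-instruct",
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: question },
          { type: "image_url", image_url: { url: imageUrl } }
        ]
      }
    ]
  })
  return completion.choices[0].message.content
}

const url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

// Visual question answering: a targeted question about the scene
console.log(await askImage(url, "What season does this photo appear to be taken in?"))

// Image captioning: a short description of the whole image
console.log(await askImage(url, "Write a one-sentence caption for this image."))
```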