mSTEB Leaderboard

This leaderboard presents the results of evaluating models on the mSTEB benchmark.

Select Region
{
  • "headers": [
    • "Model",
    • "Average ⬆️ (Class. Tasks)",
    • "LID",
    • "TC",
    • "RC-QA",
    • "NLI",
    • "MT (xx-en)",
    • "MT (en-xx)"
    ],
  • "data": [
    • [
      • "Gemini 2.0 Flash",
      • 81.18,
      • 90.5,
      • 83.5,
      • 83.4,
      • 67.3,
      • 57,
      • 42.7
      ],
    • [
      • "GPT-4o",
      • 79.3,
      • 86.4,
      • 80.6,
      • 82.3,
      • 67.9,
      • 53.3,
      • 40.7
      ],
    • [
      • "Gemma 3 27B",
      • 74.93,
      • 81,
      • 75.8,
      • 77.5,
      • 65.4,
      • 49.7,
      • 36.4
      ],
    • [
      • "Gemma-3n-E4B-it",
      • 66.03,
      • 70.6,
      • 64.4,
      • 69.6,
      • 59.5,
      • 47.3,
      • 26
      ]
    ],
  • "metadata": null
}
{
  • "headers": [
    • "Model",
    • "Average ⬆️ (Class. Tasks)",
    • "LID",
    • "TC",
    • "RC-QA",
    • "NLI",
    • "MT (xx-en)",
    • "MT (en-xx)"
    ],
  • "data": [
    • [
      • "Gemini 2.0 Flash",
      • 74.03,
      • 80,
      • 74.9,
      • 77,
      • 64.2,
      • 46.2,
      • 32
      ],
    • [
      • "GPT-4o",
      • 67.8,
      • 69.6,
      • 69.6,
      • 70.2,
      • 61.8,
      • 41,
      • 30.5
      ],
    • [
      • "Gemma 3 27B",
      • 60.45,
      • 62.9,
      • 59.9,
      • 60.6,
      • 58.4,
      • 34.8,
      • 24
      ],
    • [
      • "Gemma-3n-E4B-it",
      • 50.85,
      • 52.5,
      • 47.7,
      • 51.2,
      • 52,
      • 32.7,
      • 13.8
      ]
    ],
  • "metadata": null
}
{
  • "headers": [
    • "Model",
    • "Average ⬆️ (Class. Tasks)",
    • "LID",
    • "TC",
    • "RC-QA",
    • "NLI",
    • "MT (xx-en)",
    • "MT (en-xx)"
    ],
  • "data": [
    • [
      • "Gemini 2.0 Flash",
      • 77.47,
      • 99.5,
      • 82.9,
      • 80.7,
      • 46.8,
      • 52.6,
      • 42.9
      ],
    • [
      • "GPT-4o",
      • 76.85,
      • 99.6,
      • 80.5,
      • 78.2,
      • 49.1,
      • 49,
      • 40.8
      ],
    • [
      • "Gemma 3 27B",
      • 64.28,
      • 79.2,
      • 69.5,
      • 62.3,
      • 46.1,
      • 41.2,
      • 32.4
      ],
    • [
      • "Gemma-3n-E4B-it",
      • 58.53,
      • 81,
      • 58.3,
      • 53.5,
      • 41.3,
      • 38.2,
      • 16
      ]
    ],
  • "metadata": null
}
{
  • "headers": [
    • "Model",
    • "Average ⬆️ (Class. Tasks)",
    • "LID",
    • "TC",
    • "RC-QA",
    • "NLI",
    • "MT (xx-en)",
    • "MT (en-xx)"
    ],
  • "data": [
    • [
      • "Gemini 2.0 Flash",
      • 82.5,
      • 87.2,
      • 86.7,
      • 84.7,
      • 71.4,
      • 59.6,
      • 39.2
      ],
    • [
      • "GPT-4o",
      • 80.18,
      • 83.2,
      • 82.1,
      • 83,
      • 72.4,
      • 54.2,
      • 34.2
      ],
    • [
      • "Gemma 3 27B",
      • 77.35,
      • 78.2,
      • 81.5,
      • 78.6,
      • 71.1,
      • 51.9,
      • 33.3
      ],
    • [
      • "Gemma-3n-E4B-it",
      • 66.38,
      • 61.4,
      • 70,
      • 69.8,
      • 64.3,
      • 49.6,
      • 20.7
      ]
    ],
  • "metadata": null
}
{
  • "headers": [
    • "Model",
    • "Average ⬆️ (Class. Tasks)",
    • "LID",
    • "TC",
    • "RC-QA",
    • "NLI",
    • "MT (xx-en)",
    • "MT (en-xx)"
    ],
  • "data": [
    • [
      • "Gemini 2.0 Flash",
      • 82.05,
      • 86.6,
      • 84.5,
      • 81.7,
      • 75.4,
      • 57.6,
      • 42.1
      ],
    • [
      • "GPT-4o",
      • 79.22,
      • 82,
      • 80.9,
      • 76.3,
      • 77.7,
      • 51.8,
      • 38.5
      ],
    • [
      • "Gemma 3 27B",
      • 75.92,
      • 76.2,
      • 75.8,
      • 75.5,
      • 76.2,
      • 48.4,
      • 35.1
      ],
    • [
      • "Gemma-3n-E4B-it",
      • 65.15,
      • 59.3,
      • 64.1,
      • 68,
      • 69.2,
      • 46.1,
      • 26.3
      ]
    ],
  • "metadata": null
}
{
  • "headers": [
    • "Model",
    • "Average ⬆️ (Class. Tasks)",
    • "LID",
    • "TC",
    • "RC-QA",
    • "NLI",
    • "MT (xx-en)",
    • "MT (en-xx)"
    ],
  • "data": [
    • [
      • "GPT-4o",
      • 87.55,
      • 99.2,
      • 86.8,
      • 87,
      • 77.2,
      • 58.6,
      • 44.3
      ],
    • [
      • "Gemini 2.0 Flash",
      • 87.48,
      • 99.5,
      • 87.7,
      • 87.5,
      • 75.2,
      • 60.8,
      • 45.9
      ],
    • [
      • "Gemma 3 27B",
      • 85.72,
      • 99.3,
      • 85.3,
      • 83.3,
      • 75,
      • 56.5,
      • 38.8
      ],
    • [
      • "Gemma-3n-E4B-it",
      • 77.75,
      • 96.1,
      • 73.2,
      • 74.5,
      • 67.2,
      • 53.5,
      • 27.4
      ]
    ],
  • "metadata": null
}
{
  • "headers": [
    • "Model",
    • "Average ⬆️ (Class. Tasks)",
    • "LID",
    • "TC",
    • "RC-QA",
    • "NLI",
    • "MT (xx-en)",
    • "MT (en-xx)"
    ],
  • "data": [
    • [
      • "GPT-4o",
      • 89.65,
      • 100,
      • 87.2,
      • 90.2,
      • 81.2,
      • 56.3,
      • 33.7
      ],
    • [
      • "Gemini 2.0 Flash",
      • 87.25,
      • 99.8,
      • 86.7,
      • 85.9,
      • 76.6,
      • 57.4,
      • 32.8
      ],
    • [
      • "Gemma 3 27B",
      • 86.9,
      • 99.7,
      • 85,
      • 87.7,
      • 75.2,
      • 54.5,
      • 29.7
      ],
    • [
      • "Gemma-3n-E4B-it",
      • 81.62,
      • 98.9,
      • 76.2,
      • 80.7,
      • 70.7,
      • 52.6,
      • 24
      ]
    ],
  • "metadata": null
}
{
  • "headers": [
    • "Model",
    • "Average ⬆️ (Class. Tasks)",
    • "LID",
    • "TC",
    • "RC-QA",
    • "NLI",
    • "MT (xx-en)",
    • "MT (en-xx)"
    ],
  • "data": [
    • [
      • "GPT-4o",
      • 90.82,
      • 98.4,
      • 87.5,
      • 92.6,
      • 84.8,
      • 64.9,
      • 56.5
      ],
    • [
      • "Gemini 2.0 Flash",
      • 89.22,
      • 97.9,
      • 88.2,
      • 87.2,
      • 83.6,
      • 66.2,
      • 56.9
      ],
    • [
      • "Gemma 3 27B",
      • 88.07,
      • 94.2,
      • 85.1,
      • 90.3,
      • 82.7,
      • 62.9,
      • 52.7
      ],
    • [
      • "Gemma-3n-E4B-it",
      • 79.7,
      • 83.1,
      • 74.2,
      • 84.9,
      • 76.6,
      • 60.3,
      • 43.2
      ]
    ],
  • "metadata": null
}
{
  • "headers": [
    • "Model",
    • "Average ⬆️ (Class. Tasks)",
    • "LID",
    • "TC",
    • "RC-QA",
    • "NLI",
    • "MT (xx-en)",
    • "MT (en-xx)"
    ],
  • "data": [
    • [
      • "GPT-4o",
      • 87.95,
      • 95.7,
      • 87.5,
      • 92.8,
      • 75.8,
      • 62.4,
      • 49.2
      ],
    • [
      • "Gemini 2.0 Flash",
      • 86.92,
      • 97.8,
      • 88,
      • 86.2,
      • 75.7,
      • 63.9,
      • 51.1
      ],
    • [
      • "Gemma 3 27B",
      • 85.3,
      • 90,
      • 85,
      • 89.2,
      • 77,
      • 60.3,
      • 48.5
      ],
    • [
      • "Gemma-3n-E4B-it",
      • 77.85,
      • 80.4,
      • 74.7,
      • 83.1,
      • 73.2,
      • 57.9,
      • 39.7
      ]
    ],
  • "metadata": null
}