mSTEB Leaderboard
This leaderboard presents the results of evaluating models on the mSTEB benchmark.
Select Region (text tasks): each table below shows the results for one of the available region selections; region names are not shown here.
| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | NLI | MT (xx-en) | MT (en-xx) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 81.18 | 90.5 | 83.5 | 83.4 | 67.3 | 57 | 42.7 |
| GPT-4o | 79.3 | 86.4 | 80.6 | 82.3 | 67.9 | 53.3 | 40.7 |
| Gemma 3 27B | 74.93 | 81 | 75.8 | 77.5 | 65.4 | 49.7 | 36.4 |
| Gemma-3n-E4B-it | 66.03 | 70.6 | 64.4 | 69.6 | 59.5 | 47.3 | 26 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | NLI | MT (xx-en) | MT (en-xx) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 74.03 | 80 | 74.9 | 77 | 64.2 | 46.2 | 32 |
| GPT-4o | 67.8 | 69.6 | 69.6 | 70.2 | 61.8 | 41 | 30.5 |
| Gemma 3 27B | 60.45 | 62.9 | 59.9 | 60.6 | 58.4 | 34.8 | 24 |
| Gemma-3n-E4B-it | 50.85 | 52.5 | 47.7 | 51.2 | 52 | 32.7 | 13.8 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | NLI | MT (xx-en) | MT (en-xx) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 77.47 | 99.5 | 82.9 | 80.7 | 46.8 | 52.6 | 42.9 |
| GPT-4o | 76.85 | 99.6 | 80.5 | 78.2 | 49.1 | 49 | 40.8 |
| Gemma 3 27B | 64.28 | 79.2 | 69.5 | 62.3 | 46.1 | 41.2 | 32.4 |
| Gemma-3n-E4B-it | 58.53 | 81 | 58.3 | 53.5 | 41.3 | 38.2 | 16 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | NLI | MT (xx-en) | MT (en-xx) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 82.5 | 87.2 | 86.7 | 84.7 | 71.4 | 59.6 | 39.2 |
| GPT-4o | 80.18 | 83.2 | 82.1 | 83 | 72.4 | 54.2 | 34.2 |
| Gemma 3 27B | 77.35 | 78.2 | 81.5 | 78.6 | 71.1 | 51.9 | 33.3 |
| Gemma-3n-E4B-it | 66.38 | 61.4 | 70 | 69.8 | 64.3 | 49.6 | 20.7 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | NLI | MT (xx-en) | MT (en-xx) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 82.05 | 86.6 | 84.5 | 81.7 | 75.4 | 57.6 | 42.1 |
| GPT-4o | 79.22 | 82 | 80.9 | 76.3 | 77.7 | 51.8 | 38.5 |
| Gemma 3 27B | 75.92 | 76.2 | 75.8 | 75.5 | 76.2 | 48.4 | 35.1 |
| Gemma-3n-E4B-it | 65.15 | 59.3 | 64.1 | 68 | 69.2 | 46.1 | 26.3 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | NLI | MT (xx-en) | MT (en-xx) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| GPT-4o | 87.55 | 99.2 | 86.8 | 87 | 77.2 | 58.6 | 44.3 |
| Gemini 2.0 Flash | 87.48 | 99.5 | 87.7 | 87.5 | 75.2 | 60.8 | 45.9 |
| Gemma 3 27B | 85.72 | 99.3 | 85.3 | 83.3 | 75 | 56.5 | 38.8 |
| Gemma-3n-E4B-it | 77.75 | 96.1 | 73.2 | 74.5 | 67.2 | 53.5 | 27.4 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | NLI | MT (xx-en) | MT (en-xx) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| GPT-4o | 89.65 | 100 | 87.2 | 90.2 | 81.2 | 56.3 | 33.7 |
| Gemini 2.0 Flash | 87.25 | 99.8 | 86.7 | 85.9 | 76.6 | 57.4 | 32.8 |
| Gemma 3 27B | 86.9 | 99.7 | 85 | 87.7 | 75.2 | 54.5 | 29.7 |
| Gemma-3n-E4B-it | 81.62 | 98.9 | 76.2 | 80.7 | 70.7 | 52.6 | 24 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | NLI | MT (xx-en) | MT (en-xx) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| GPT-4o | 90.82 | 98.4 | 87.5 | 92.6 | 84.8 | 64.9 | 56.5 |
| Gemini 2.0 Flash | 89.22 | 97.9 | 88.2 | 87.2 | 83.6 | 66.2 | 56.9 |
| Gemma 3 27B | 88.07 | 94.2 | 85.1 | 90.3 | 82.7 | 62.9 | 52.7 |
| Gemma-3n-E4B-it | 79.7 | 83.1 | 74.2 | 84.9 | 76.6 | 60.3 | 43.2 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | NLI | MT (xx-en) | MT (en-xx) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| GPT-4o | 87.95 | 95.7 | 87.5 | 92.8 | 75.8 | 62.4 | 49.2 |
| Gemini 2.0 Flash | 86.92 | 97.8 | 88 | 86.2 | 75.7 | 63.9 | 51.1 |
| Gemma 3 27B | 85.3 | 90 | 85 | 89.2 | 77 | 60.3 | 48.5 |
| Gemma-3n-E4B-it | 77.85 | 80.4 | 74.7 | 83.1 | 73.2 | 57.9 | 39.7 |

Select Region (speech tasks): each table below shows the results for one of the available region selections; region names are not shown here.
| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | ASR | S2TT (xx-en) |
| --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 79.4 | 86.1 | 79.5 | 72.6 | 10.3 | 53.5 |
| GPT-4o-audio | 73.53 | 81.2 | 76 | 63.4 | 30.1 | 47.3 |
| Gemma-3n-E4B-it | 61.23 | 65.4 | 62.9 | 55.4 | 29.2 | 32.4 |
| Qwen2-Audio-7B-Instruct | 29.83 | 15.8 | 40.4 | 33.3 | 88.4 | 20.5 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | ASR | S2TT (xx-en) |
| --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 64.17 | 68.5 | 68.5 | 55.5 | 18 | 37.6 |
| GPT-4o-audio | 52.5 | 56.6 | 55.3 | 45.6 | 60.1 | 28.5 |
| Gemma-3n-E4B-it | 44.63 | 51 | 45.2 | 37.7 | 31.5 | 24.6 |
| Qwen2-Audio-7B-Instruct | 18.17 | 0 | 28.2 | 26.3 | 83.9 | 13.2 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | ASR | S2TT (xx-en) |
| --- | --- | --- | --- | --- | --- | --- |
| GPT-4o-audio | 70.2 | 98.1 | 66.9 | 45.6 | 31.7 | 36.5 |
| Gemini 2.0 Flash | 64.1 | 96.7 | 44.7 | 50.9 | 19.5 | 27.2 |
| Gemma-3n-E4B-it | 50.77 | 89.3 | 30.7 | 32.3 | 41 | 19.9 |
| Qwen2-Audio-7B-Instruct | 14.2 | 0.3 | 15.2 | 27.1 | 146.2 | 11.9 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | ASR | S2TT (xx-en) |
| --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 84 | 97.5 | 83 | 71.5 | 7.7 | 58.3 |
| GPT-4o-audio | 77.93 | 91.8 | 79.8 | 62.2 | 35.1 | 47.3 |
| Gemma-3n-E4B-it | 65.53 | 71.7 | 67.9 | 57 | 38.3 | 28.3 |
| Qwen2-Audio-7B-Instruct | 22.53 | 4.5 | 32.8 | 30.3 | 113.7 | 16 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | ASR | S2TT (xx-en) |
| --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 83.03 | 91 | 82.3 | 75.8 | 7.3 | 57.8 |
| GPT-4o-audio | 76.2 | 88.8 | 76 | 63.8 | 35 | 48 |
| Gemma-3n-E4B-it | 63.7 | 70 | 64.8 | 56.3 | 33.3 | 30.3 |
| Qwen2-Audio-7B-Instruct | 24.97 | 10.5 | 33 | 31.4 | 97.8 | 16 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | ASR | S2TT (xx-en) |
| --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 81.6 | 88 | 82.1 | 74.7 | 6.7 | 55.5 |
| GPT-4o-audio | 74.63 | 80.2 | 79.1 | 64.6 | 27.2 | 47.5 |
| Gemma-3n-E4B-it | 65.47 | 74.2 | 66.2 | 56 | 25.8 | 35.2 |
| Qwen2-Audio-7B-Instruct | 26.73 | 21.7 | 28.4 | 30.1 | 111.7 | 15.6 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | ASR | S2TT (xx-en) |
| --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 86.3 | 98.1 | 83.5 | 77.3 | 13.9 | 52.8 |
| GPT-4o-audio | 80.33 | 90.5 | 80.5 | 70 | 23.2 | 47.2 |
| Gemma-3n-E4B-it | 60.13 | 62.3 | 64.3 | 53.8 | 49.1 | 26.2 |
| Qwen2-Audio-7B-Instruct | 53.8 | 59.2 | 58.7 | 43.5 | 48.2 | 34 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | ASR | S2TT (xx-en) |
| --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 82.77 | 84.5 | 82.1 | 81.7 | 7.3 | 58.3 |
| GPT-4o-audio | 81.47 | 86.1 | 84 | 74.3 | 12.4 | 56.9 |
| Gemma-3n-E4B-it | 66.3 | 64.3 | 68.1 | 66.5 | 22.8 | 39.1 |
| Qwen2-Audio-7B-Instruct | 41.57 | 28.6 | 54.5 | 41.6 | 62.6 | 29.6 |

| Model | Average ⬆️ (Class. Tasks) | LID | TC | RC-QA | ASR | S2TT (xx-en) |
| --- | --- | --- | --- | --- | --- | --- |
| Gemini 2.0 Flash | 87.03 | 94.1 | 84.9 | 82.1 | 9.9 | 62.2 |
| GPT-4o-audio | 83.33 | 90.4 | 85.6 | 74 | 12 | 59.3 |
| Gemma-3n-E4B-it | 70.1 | 71.3 | 72.9 | 66.1 | 20 | 38.9 |
| Qwen2-Audio-7B-Instruct | 34 | 11.8 | 53.6 | 36.6 | 98.2 | 23.9 |

Reproducibility
To reproduce our results, please see the GitHub page for mSTEB:
Submit your results
Please provide the model name, a CSV file, and the appropriate result type to upload your evaluation results for mSTEB.
Kindly format the results in the same way as in the sample CSV files below.
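The sample CSV files themselves are not reproduced here, so the exact schema should be taken from them. As a rough, hypothetical illustration only, a text-track results file with one column per task score might be written like this (the column names and score values below are placeholders, not the official format):

```python
import csv

# Hypothetical layout -- follow the sample CSV files above for real submissions.
fieldnames = ["model", "LID", "TC", "RC-QA", "NLI", "MT (xx-en)", "MT (en-xx)"]
rows = [
    {"model": "my-model", "LID": 85.0, "TC": 78.2, "RC-QA": 80.1,
     "NLI": 66.0, "MT (xx-en)": 50.3, "MT (en-xx)": 35.9},
]

# Write one row per evaluated model.
with open("msteb_results.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)
```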