## GPU Configurations used 


In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Mon Jun 14 15:38:34 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Loading the necessary libraries and datasets

Here we have saved the entire data in a zipped folder - "AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF.zip"
<br>
This zipped folder contains-
* Train and test csv
* Audio clips related to the both train and test data that needs to be unzipped

<br>
We will also create a folder "model_asr" where model checkpoints will be saved.

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# !mkdir datasets
# !unzip '/content/drive/MyDrive/problem5.zip' -d datasets
# !unzip '/content/datasets/problem5/test.zip' -d '/content/datasets/problem5/test_clips'
# !unzip '/content/datasets/problem5/val.zip' -d '/content/datasets/problem5/val_clips'
# !unzip '/content/datasets/problem5/train.zip' -d '/content/datasets/problem5/train_clips'

# !mkdir model_asr


# !pip install aicrowd-cli
# !mkdir assets
# !aicrowd login --api-key $API_KEY



# !pip install --upgrade torch
# !pip install --upgrade datasets
# !pip install --upgrade transformers
# !pip install --upgrade torchaudio
# !pip install --upgrade librosa
# !pip install --upgrade jiwer
# !pip install --upgrade audiomentations
# !pip install --upgrade fuzzywuzzy


In [3]:
import pandas as pd
import numpy as np
import re
import json
import random
import os


import torch
import torchaudio
import librosa

from sklearn.model_selection import train_test_split

from datasets import Dataset, load_metric
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor


from transformers import Wav2Vec2ForCTC
from transformers import TrainingArguments
from transformers import Trainer




def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

seed_all(13)

## Loading the train and test datasets.
<br> 
1. __Oversampling of less freqency data points__ :- We will oversample the datapoints where the frequency is less than 8 for training. 
2. __Train-test split__ :- We will now use stratified train test split with train-test split being (90-10) on the oversampled data 

In [4]:
import numpy as np

np.random.seed(13)

train_df               = pd.read_csv('/content/datasets/problem5/train.csv')
train_df['audio_path'] = '/content/datasets/problem5/train_clips/'+train_df['SoundID'].astype(str)+'.wav'

val_df                 = pd.read_csv('/content/datasets/problem5/val.csv')
val_df['audio_path']   = '/content/datasets/problem5/val_clips/'+val_df['SoundID'].astype(str)+'.wav'

test_df                = pd.read_csv('/content/datasets/problem5/test.csv')
test_df['audio_path']  = '/content/datasets/problem5/test_clips/'+test_df['SoundID'].astype(str)+'.wav'


train_df               = train_df.rename({'label':'transcription'},axis=1)
val_df                 = val_df.rename({'label':'transcription'},axis=1)
test_df                = test_df.rename({'label':'transcription'},axis=1)

print(f'The shape of train data :- {train_df.shape} , shape of validation data :- {val_df.shape} and shape of test data is {test_df.shape}')
print(f'Columns in train data      :- {train_df.columns.tolist()}')
print(f'Columns in validation data :- {val_df.columns.tolist()}')
print(f'Columns in test data       :- {test_df.columns.tolist()}')

# ## We need to find the 'transcription column' from the test dataset.



train_data = train_df[['audio_path','transcription']]
test_data  = val_df[['audio_path','transcription']]

train_data['transcription'] = train_data['transcription'].astype(str)
test_data['transcription']  = test_data['transcription'].astype(str)


print(f'The shape of train data and validation :- {train_data.shape,test_data.shape}')

The shape of train data :- (20000, 3) , shape of validation data :- (2000, 3) and shape of test data is (5000, 3)
Columns in train data      :- ['SoundID', 'transcription', 'audio_path']
Columns in validation data :- ['SoundID', 'transcription', 'audio_path']
Columns in test data       :- ['SoundID', 'transcription', 'audio_path']
The shape of train data and validation :- ((20000, 2), (2000, 2))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
print(f"Length of test clips - {len(os.listdir('/content/datasets/problem5/test_clips'))}")
print(f"Length of train clips - {len(os.listdir('/content/datasets/problem5/train_clips'))}")
print(f"Length of val clips - {len(os.listdir('/content/datasets/problem5/val_clips'))}")

Length of test clips - 5000
Length of train clips - 20000
Length of val clips - 2000


In [6]:
train_data = Dataset.from_pandas(train_data)
test_data  = Dataset.from_pandas(test_data)

## Creating and saving the Vocabulary
We will save the character based vocabulary based on train transcriptions as vocab.json file in the model_asr folder.
<br>
Important points:- 
1. We select some characters to remove and also convert entire batch of transcriptions to lowercase (to ensure uniformity) in both train and test dataset. 
2. We also remove other column names like Downvotes, Gender, upvotes  because they were not needed as of now for analysis (even though they can be used for data augmentation while training but to reduce the runtime and get reasonable results early, we dropped those columns)
3. We eventually saved the characters in vocab.json in model_asr folder where it will be used by the model we will be calling in later stages


In [7]:
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\)\(\�]'
def remove_special_characters(batch):
  '''
      We remove special characters from each sentence.
  '''
  batch["transcription"] = re.sub(chars_to_ignore_regex, '', batch["transcription"]).lower() + " "
  return batch

train_data = train_data.map(remove_special_characters)
test_data  = test_data.map(remove_special_characters)

def extract_all_chars(batch):
  all_text = " ".join(batch["transcription"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

vocab_train = train_data.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=train_data.column_names)
vocab_test  = test_data.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=test_data.column_names)

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [8]:
## vocabulary list from the train dataset
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))  ## provides a list of characters from a list
vocab_dict = {v: k for k, v in enumerate(vocab_list)}

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print('The total vocabulary present in train transcriptions is ',len(vocab_dict))

The total vocabulary present in train transcriptions is  29


In [9]:
with open('/content/model_asr/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

## Tokenizing and Feature Extraction
1. We use the vocabulary extracted in the previous step and feed it into the Wav2Vec2CTCTokenizer and also specify some basic tokens to be used while training 
2. The feature extraction steps will include the usage of feature_extractor with default values which enables us to get a float array of raw waveform of the clips. <br>
One important thing to note over here is the fact that we are using the samling rate as 16000 hz/second because our model is trained on the input wav form of 16000 hz/second only
3. We eventually save our processor which includes our tokenizer (from point1 and feature extractor from point2) into the model_asr folder

In [10]:
tokenizer         = Wav2Vec2CTCTokenizer("/content/model_asr/vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

processor         = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

processor.save_pretrained("/content/model_asr")

### Here we convert the speech file(audio clips) to array 

In [11]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["audio_path"])
    batch["speech"] = librosa.resample(np.asarray(speech_array[0].numpy()), sampling_rate, 16_000)
    batch["sampling_rate"] = 16_000
    batch["target_text"] = batch["transcription"]
    return batch

train_data = train_data.map(speech_file_to_array_fn)
test_data = test_data.map(speech_file_to_array_fn)

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [12]:
def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values

    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, batch_size=8, batched=True)
test_data = test_data.map(prepare_dataset, remove_columns=test_data.column_names, batch_size=8, batched=True)

HBox(children=(FloatProgress(value=0.0, max=2500.0), HTML(value='')))

  return array(a, dtype, copy=False, order=order)





HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




## We create a Data Collator class here. 
The entire code is taken here - https://github.com/huggingface/transformers/blob/9a06b6b11bdfc42eea08fa91d0c737d1863c99e3/examples/research_projects/wav2vec2/run_asr.py#L81
<br>
The arguments corresponding to the class are explained in the docstring below. We tried not to experiment much with this class because a small tweak(like changing max_length) was giving really bizarre results and hence we took the class as it is without any changes from the repo specified above.
<br>
Also, we initialise the metric to be used for calculations which is Word Error Rate.



In [13]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [14]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
wer_metric = load_metric("wer")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1947.0, style=ProgressStyle(description…




In [15]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## Setting up model parameters

1. Model used is facebook's Wav2vec2-large-xlsr-53 which is a wav2vec model trained over 53 languages and we can finetune it to the language we need. 
2. The parameters selected in below steps are obtained by running WandB sweeps. With default parameters we were not getting good results but the sweeps helped us a lot in getting a lift in performance. 
3. The overall runtime of the trainer is about 12 hours on the GPU specified above and the checkpoints are saved in the folder model_asr as specified.

In [16]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.015715766711072065,
    feat_proj_dropout= 0.07570439532163029,
    activation_dropout=0.09145432252955588,
    hidden_dropout=0.0006515376406130203,
    
    mask_time_prob=0.05353409500178331,
    layerdrop=0.018085056635857365,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)


model.freeze_feature_extractor()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1768.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1269737156.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.bias', 'project_q.weight', 'project_hid.bias', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to u

In [17]:

training_args = TrainingArguments(
  output_dir="/content/model_asr",
  
  group_by_length=True,
  per_device_train_batch_size=4,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=20,
  dataloader_num_workers= 32, 
  load_best_model_at_end=True,
  metric_for_best_model='wer',
  greater_is_better=False,
  fp16=True,
  seed=13,
  save_steps=100,
  eval_steps=500,
  logging_steps=500,
  learning_rate=0.000095637994662983496,
  lr_scheduler_type = 'cosine_with_restarts',
  warmup_steps=500,
  save_total_limit=1,
)

In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=processor.feature_extractor,
)

In [19]:
trainer.train()

  cpuset_checked))


Step,Training Loss,Validation Loss,Wer
500,4.1492,2.90632,0.999879
1000,2.5034,1.044372,0.965367
1500,0.8494,0.403222,0.614556
2000,0.5612,0.300918,0.531364
2500,0.4696,0.241468,0.483894
3000,0.3798,0.220342,0.460039
3500,0.3653,0.200401,0.454226
4000,0.3246,0.176424,0.4248
4500,0.3056,0.160251,0.405909
5000,0.2886,0.147907,0.400339


  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_che

TrainOutput(global_step=50000, training_loss=0.19017380695343017, metrics={'train_runtime': 46599.2282, 'train_samples_per_second': 1.073, 'total_flos': 0, 'epoch': 20.0, 'init_mem_cpu_alloc_delta': 846278656, 'init_mem_gpu_alloc_delta': 1261874176, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 455839744, 'train_mem_gpu_alloc_delta': 3777208320, 'train_mem_cpu_peaked_delta': 198139904, 'train_mem_gpu_peaked_delta': 789448704})

## Extracting the relevent checkpoint to be used on test dataset
We extract the checkpoint having least eval-WER from the checkpoints saved in the folder. The model and tokenizer are initialised with the model and tokenizer saved in the checkpoint and usual steps to preprocess are applied again as we did for train data to get the results needed.

In [20]:
eval_steps = 500
eval_wer = []
for log_history in trainer.state.log_history:
  if 'eval_wer' in log_history.keys():
    eval_wer.append(log_history['eval_wer'])
steps_taken = [eval_steps*(i+1) for i in range(len(eval_wer))]

eval_dict = dict(zip(steps_taken,eval_wer))


import os
files = os.listdir('/content/model_asr')
checkpoints_available = []
for file in files:
  if 'checkpoint' in file:
    checkpoints_available.append(file)

least_loss = 2
for chkpts in checkpoints_available:  
  number = int(re.findall(r'\d+', chkpts)[0])
  if eval_dict[number]< least_loss:
    least_loss = eval_dict[number]
    chkpt2consider = '/content/model_asr/'+chkpts

In [21]:
chkpt2consider

'/content/model_asr/checkpoint-42500'

In [22]:
model = Wav2Vec2ForCTC.from_pretrained(chkpt2consider).to("cuda")
processor = Wav2Vec2Processor.from_pretrained("/content/model_asr")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
test_df = test_df[['audio_path']]
test_data  = Dataset.from_pandas(test_df)



def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["audio_path"])
    batch["speech"] = librosa.resample(np.asarray(speech_array[0].numpy()), sampling_rate, 16_000)
    batch["sampling_rate"] = 16_000
    
    return batch

test_data = test_data.map(speech_file_to_array_fn, remove_columns=test_data.column_names)



HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [24]:
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch


In [None]:
result = test_data.map(evaluate, batched=True, batch_size=8)


HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))

## Saving the transcriptions and post processing (part 1)
In the first step of post processing, we remove special tokens from the transcriptions as well as any extra trailing space in the text attained.
<br>
Also, there was a datapoint which was empty (no transcription possible due to blank audio). In order to accommodate that, we make an adjustment based on sequence length of transcription attained (else the submissions were throwing error if '' is used instead of ' '). 

In [None]:
outputdf = pd.read_csv('/content/datasets/problem5/test.csv')
output = result["pred_strings"]
outputdf['transcription'] = output

In [None]:
subdf = outputdf[['SoundID','transcription']]
subdf.transcription=subdf.transcription.str.replace('\[PAD\]','')

subdf['transcription']=subdf['transcription'].str.strip()

In [None]:
subdf['length'] = subdf['transcription'].str.len()
subdf['transcription'] = subdf.apply(lambda z: z['transcription'] if z['length']>0 else " ",axis=1)

In [None]:
subdf = subdf.rename({"transcription":"label"},axis=1)
subdf

In [None]:

submisdf = subdf.copy()
submisdf.to_csv(os.path.join("assets", "submission.csv"), index=False)


## VALID SCORE


In [None]:
val_df = val_df[['audio_path']]
val_data  = Dataset.from_pandas(val_df)



def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["audio_path"])
    batch["speech"] = librosa.resample(np.asarray(speech_array[0].numpy()), sampling_rate, 16_000)
    batch["sampling_rate"] = 16_000
    
    return batch

val_data = val_data.map(speech_file_to_array_fn, remove_columns=val_data.column_names)



In [None]:
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch


In [None]:
result = val_data.map(evaluate, batched=True, batch_size=8)


In [None]:
val_df = pd.read_csv('/content/datasets/problem5/val.csv')
output = result["pred_strings"]
val_df['transcription'] = output

In [None]:
from jiwer import wer

val_df['Wer'] = val_df.apply(lambda z: wer(z['label'],z['transcription']),axis=1)

val_df['Wer'].mean()