I'm running into an audio decoding error on Kaggle, while the same code runs without errors on Colab. The failure occurs when I map my preprocessing function over the dataset:
voices = datasets.map(prepare_dataset, remove_columns=datasets.column_names["train"])
The prepare_dataset function:
import librosa

def prepare_dataset(batch):
    # Accessing "audio" triggers datasets' lazy decoding of the mp3 file
    audio = batch["audio"]
    # Resample to 16 kHz, the rate the feature extractor expects
    resampled_audio = librosa.resample(audio["array"], orig_sr=audio["sampling_rate"], target_sr=16000)
    batch["input_features"] = feature_extractor(resampled_audio, sampling_rate=16000).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch
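Note that, per the traceback below, the failure happens the moment batch["audio"] is accessed: datasets lazily decodes the mp3 via torchaudio at that point, before any of my resampling code runs. So a minimal reproduction (assuming the same datasets object used in the map call above) should be just:

# Reading the "audio" column triggers datasets' Audio.decode_example,
# which routes mp3 decoding through torchaudio; this alone should hit
# the same RuntimeError on Kaggle.
sample = datasets["train"][0]["audio"]
print(sample["sampling_rate"], len(sample["array"]))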
Error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[18], line 1
----> 1 voices = datasets.map(prepare_dataset, remove_columns=datasets.column_names["train"])
File /opt/conda/lib/python3.10/site-packages/datasets/dataset_dict.py:438, in DatasetDict.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, desc)
435 if cache_file_names is None:
436 cache_file_names = {k: None for k in self}
437 return DatasetDict(
--> 438 {
439 k: dataset.map(
440 function=function,
441 with_indices=with_indices,
442 with_rank=with_rank,
443 input_columns=input_columns,
444 batched=batched,
445 batch_size=batch_size,
446 drop_last_batch=drop_last_batch,
447 remove_columns=remove_columns,
448 keep_in_memory=keep_in_memory,
449 load_from_cache_file=load_from_cache_file,
450 cache_file_name=cache_file_names[k],
451 writer_batch_size=writer_batch_size,
452 features=features,
453 disable_nullable=disable_nullable,
454 fn_kwargs=fn_kwargs,
455 num_proc=num_proc,
456 desc=desc,
457 )
458 for k, dataset in self.items()
459 }
460 )
File /opt/conda/lib/python3.10/site-packages/datasets/dataset_dict.py:439, in <dictcomp>(.0)
435 if cache_file_names is None:
436 cache_file_names = {k: None for k in self}
437 return DatasetDict(
438 {
--> 439 k: dataset.map(
440 function=function,
441 with_indices=with_indices,
442 with_rank=with_rank,
443 input_columns=input_columns,
444 batched=batched,
445 batch_size=batch_size,
446 drop_last_batch=drop_last_batch,
447 remove_columns=remove_columns,
448 keep_in_memory=keep_in_memory,
449 load_from_cache_file=load_from_cache_file,
450 cache_file_name=cache_file_names[k],
451 writer_batch_size=writer_batch_size,
452 features=features,
453 disable_nullable=disable_nullable,
454 fn_kwargs=fn_kwargs,
455 num_proc=num_proc,
456 desc=desc,
457 )
458 for k, dataset in self.items()
459 }
460 )
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1955, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
1952 disable_tqdm = not logging.is_progress_bar_enabled()
1954 if num_proc is None or num_proc == 1:
-> 1955 return self._map_single(
1956 function=function,
1957 with_indices=with_indices,
1958 with_rank=with_rank,
1959 input_columns=input_columns,
1960 batched=batched,
1961 batch_size=batch_size,
1962 drop_last_batch=drop_last_batch,
1963 remove_columns=remove_columns,
1964 keep_in_memory=keep_in_memory,
1965 load_from_cache_file=load_from_cache_file,
1966 cache_file_name=cache_file_name,
1967 writer_batch_size=writer_batch_size,
1968 features=features,
1969 disable_nullable=disable_nullable,
1970 fn_kwargs=fn_kwargs,
1971 new_fingerprint=new_fingerprint,
1972 disable_tqdm=disable_tqdm,
1973 desc=desc,
1974 )
1975 else:
1977 def format_cache_file_name(cache_file_name, rank):
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:520, in transmit_tasks.<locals>.wrapper(*args, **kwargs)
518 self: "Dataset" = kwargs.pop("self")
519 # apply actual function
--> 520 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
521 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
522 for dataset in datasets:
523 # Remove task templates if a column mapping of the template is no longer valid
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:487, in transmit_format.<locals>.wrapper(*args, **kwargs)
480 self_format = {
481 "type": self._format_type,
482 "format_kwargs": self._format_kwargs,
483 "columns": self._format_columns,
484 "output_all_columns": self._output_all_columns,
485 }
486 # apply actual function
--> 487 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
488 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
489 # re-apply format to the output
File /opt/conda/lib/python3.10/site-packages/datasets/fingerprint.py:458, in fingerprint_transform.<locals>._fingerprint.<locals>.wrapper(*args, **kwargs)
452 kwargs[fingerprint_name] = update_fingerprint(
453 self._fingerprint, transform, kwargs_for_fingerprint
454 )
456 # Call actual function
--> 458 out = func(self, *args, **kwargs)
460 # Update fingerprint of in-place transforms + update in-place history of transforms
462 if inplace: # update after calling func so that the fingerprint doesn't change if the function fails
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:2320, in Dataset._map_single(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset, disable_tqdm, desc, cache_only)
2318 if not batched:
2319 for i, example in enumerate(pbar):
-> 2320 example = apply_function_on_filtered_inputs(example, i, offset=offset)
2321 if update_data:
2322 if i == 0:
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:2220, in Dataset._map_single.<locals>.apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples, offset)
2218 if with_rank:
2219 additional_args += (rank,)
-> 2220 processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
2221 if update_data is None:
2222 # Check if the function returns updated examples
2223 update_data = isinstance(processed_inputs, (Mapping, pa.Table))
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1915, in Dataset.map.<locals>.decorate.<locals>.decorated(item, *args, **kwargs)
1911 decorated_item = (
1912 Example(item, features=self.features) if not batched else Batch(item, features=self.features)
1913 )
1914 # Use the LazyDict internally, while mapping the function
-> 1915 result = f(decorated_item, *args, **kwargs)
1916 # Return a standard dict
1917 return result.data if isinstance(result, LazyDict) else result
Cell In[17], line 7, in prepare_dataset(batch)
6 def prepare_dataset(batch):
----> 7 audio = batch["audio"]
8 orig_sr = audio["sampling_rate"]
10 # Resample audio to the target sampling rate (e.g., 16000)
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:117, in Example.__getitem__(self, key)
115 value = super().__getitem__(key)
116 if self.features and key in self.features:
--> 117 value = decode_nested_example(self.features[key], value) if value is not None else None
118 self[key] = value
119 del self.features[key]
File /opt/conda/lib/python3.10/site-packages/datasets/features/features.py:1087, in decode_nested_example(schema, obj)
1085 # Object with special decoding:
1086 elif isinstance(schema, (Audio, Image)):
-> 1087 return schema.decode_example(obj) if obj is not None else None
1088 return obj
File /opt/conda/lib/python3.10/site-packages/datasets/features/audio.py:103, in Audio.decode_example(self, value)
101 raise ValueError(f"An audio sample should have one of 'path' or 'bytes' but both are None in {value}.")
102 elif path is not None and path.endswith("mp3"):
--> 103 array, sampling_rate = self._decode_mp3(file if file else path)
104 elif path is not None and path.endswith("opus"):
105 if file:
File /opt/conda/lib/python3.10/site-packages/datasets/features/audio.py:241, in Audio._decode_mp3(self, path_or_file)
238 except RuntimeError as err:
239 raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.") from err
--> 241 array, sampling_rate = torchaudio.load(path_or_file, format="mp3")
242 if self.sampling_rate and self.sampling_rate != sampling_rate:
243 if not hasattr(self, "_resampler") or self._resampler.orig_freq != sampling_rate:
File /opt/conda/lib/python3.10/site-packages/torchaudio/backend/sox_io_backend.py:226, in load(filepath, frame_offset, num_frames, normalize, channels_first, format)
224 buffer_size = get_buffer_size()
225 if format == "mp3":
--> 226 return _fallback_load_fileobj(
227 filepath,
228 frame_offset,
229 num_frames,
230 normalize,
231 channels_first,
232 format,
233 buffer_size,
234 )
235 warnings.warn(_deprecation_message)
236 ret = torchaudio.lib._torchaudio_sox.load_audio_fileobj(
237 filepath, frame_offset, num_frames, normalize, channels_first, format
238 )
File /opt/conda/lib/python3.10/site-packages/torchaudio/backend/sox_io_backend.py:34, in _fail_load_fileobj(fileobj, *args, **kwargs)
33 def _fail_load_fileobj(fileobj, *args, **kwargs):
---> 34 raise RuntimeError(f"Failed to load audio from {fileobj}")
RuntimeError: Failed to load audio from <_io.BytesIO object at 0x7b01dd5f8b80>
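The final frames suggest the difference lies in torchaudio itself: the sox_io backend hands mp3 files to a fallback loader, and on this Kaggle image that fallback appears to be the _fail_load_fileobj stub, which raises unconditionally. A quick way to compare the two environments (torchaudio.list_audio_backends() is the library's standard introspection call) is:

import torch
import torchaudio

# Compare the output of this cell on Colab vs. Kaggle: a torchaudio
# version mismatch or a missing decoding backend would explain why the
# same mp3 decodes on one platform and fails on the other.
print(torch.__version__, torchaudio.__version__)
print(torchaudio.list_audio_backends())  # e.g. ['soundfile', 'sox_io']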