@inproceedings{486e364cb0e34ca899ee23ecddb3d440,
title = "Towards speech classification from acoustic and vocal tract data in real-time MRI",
abstract = "Real-time magnetic resonance image (rtMRI) data of the upper airway provides a rich source of information about vocal tract shaping that can inform phonemic analysis and classification.We describe a multimodal phonemic classifier that combines articulatory data with speech audio features to improve performance.A deep network model processes rtMRI video data using ResNet18 and speech audio using a custom CNN and then combines the two data streams using a Transformer layer to fully explore the correlation of the two streams towards better vowel-consonant-vowel classification via the Transformer's multi-head self-attention mechanism.The classification accuracy of both the unimodal and multimodal models show substantial improvement on previous work (> 38%).The addition of audio features improves classification accuracy in the multimodal model by 7% compared with the unimodal model using articulatory data.We analyze the model and discuss the phonetic implications.",
keywords = "multimodal networks, phonemic classification, real-time MRI, Transformer, vocal tract",
author = "Yaoyao Yue and Michael Proctor and Luping Zhou and Rijul Gupta and Tharinda Piyadasa and Amelia Gully and Kirrie Ballard and Craig Jin",
year = "2024",
doi = "10.21437/Interspeech.2024-840",
language = "English",
series = "INTERSPEECH",
publisher = "International Speech Communication Association",
pages = "1345--1349",
editor = "Itshak Lapidot and Sharon Gannot",
booktitle = "Interspeech 2024",
note = "Interspeech Conference (24th : 2024) ; Conference date: 01-09-2024 Through 05-09-2024",
}