% Conference paper: Vx2Text (Lin et al.), CVPR, June 2021.
% Braces inside title/booktitle protect words whose capitalisation must
% survive styles that apply sentence casing.
@InProceedings{vx2text_2021_CVPR,
  author    = {Lin, Xudong and Bertasius, Gedas and Wang, Jue and Chang, Shih-Fu and Parikh, Devi and Torresani, Lorenzo},
  title     = {{Vx2Text}: End-to-End Learning of Video-Based Text Generation from Multimodal Inputs},
  booktitle = {The {IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2021},
}