% VATT paper, Advances in Neural Information Processing Systems, volume 34 (2021).
% Notes for maintainers:
%   - The acronym VATT is brace-protected so sentence-casing styles keep it uppercase.
%   - Editors are in "Last, First" form so the compound surname "Wortman Vaughan"
%     is not split into given name + surname by the name parser.
%   - Editor given names are initials only, as supplied by the proceedings export.
@inproceedings{NEURIPS2021_cb3213ad,
  author    = {Akbari, Hassan and Yuan, Liangzhe and Qian, Rui and Chuang, Wei-Hong and Chang, Shih-Fu and Cui, Yin and Gong, Boqing},
  title     = {{VATT}: Transformers for Multimodal Self-Supervised Learning from Raw Video, Audio and Text},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {Ranzato, M. and Beygelzimer, A. and Dauphin, Y. and Liang, P. S. and Wortman Vaughan, J.},
  volume    = {34},
  pages     = {24206--24221},
  publisher = {Curran Associates, Inc.},
  year      = {2021},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2021/file/cb3213ada48302953cb0f166464ab356-Paper.pdf},
}