|
TAPNext: Tracking Any Point (TAP) as Next Token Prediction
Artem Zholus, Carl Doersch, Yi Yang, Skanda Koppula, Viorica Patraucean, Xu He, Ignacio Rocco, Mehdi S. M. Sajjadi, Sarath Chandar, Ross Goroshin
in ICCV 2025
@inproceedings{zholus2025tapnext,
title={TAPNext: Tracking Any Point (TAP) as Next Token Prediction},
author={Zholus, Artem and Doersch, Carl and Yang, Yi and Koppula, Skanda and Patraucean, Viorica and He, Xu and Rocco, Ignacio and Sajjadi, Mehdi S. M. and Chandar, Sarath and Goroshin, Ross},
booktitle={International Conference on Computer Vision (ICCV)},
year={2025}
}
|
|
|
Direct Motion Models for Assessing Generated Videos
Kelsey Allen, Carl Doersch, Guangyao Zhou, Mohammed Suhail, Danny Driess, Ignacio Rocco, Yulia Rubanova, Thomas Kipf, Mehdi S. M. Sajjadi, Kevin Murphy, Joao Carreira, Sjoerd van Steenkiste
in ICML 2025
@inproceedings{allen2025direct,
title={Direct Motion Models for Assessing Generated Videos},
author={Allen, Kelsey and Doersch, Carl and Zhou, Guangyao and Suhail, Mohammed and Driess, Danny and Rocco, Ignacio and Rubanova, Yulia and Kipf, Thomas and Sajjadi, Mehdi S. M. and Murphy, Kevin and Carreira, Joao and van Steenkiste, Sjoerd},
booktitle={International Conference on Machine Learning (ICML)},
year={2025}
}
|
|
Motion Prompting: Controlling Video Generation with Motion Trajectories
Daniel Geng, Charles Herrmann, Junhwa Hur, Forrester Cole, Serena Zhang, Tobias Pfaff, Tatiana Lopez-Guevara, Carl Doersch, Yusuf Aytar, Michael Rubinstein, Chen Sun, Oliver Wang, Andrew Owens, Deqing Sun
@article{geng2024motion,
title={Motion Prompting: Controlling Video Generation with Motion Trajectories},
author={Geng, Daniel and Herrmann, Charles and Hur, Junhwa and Cole, Forrester and Zhang, Serena and Pfaff, Tobias and Lopez-Guevara, Tatiana and Doersch, Carl and Aytar, Yusuf and Rubinstein, Michael and Sun, Chen and Wang, Oliver and Owens, Andrew and Sun, Deqing},
journal={arXiv preprint arXiv:2412.02700},
year={2024}
}
|
|
Gen2Act: Human Video Generation in Novel Scenarios enables Generalizable Robot Manipulation
Homanga Ballav, Suneel Belkhale, Philipp Kühenbühl, Kanika Madan, Carl Doersch, Igor Mordatch, Deepak Pathak
in CoRL 2025
@inproceedings{ballav2025gen2act,
title={Gen2Act: Human Video Generation in Novel Scenarios enables Generalizable Robot Manipulation},
author={Ballav, Homanga and Belkhale, Suneel and K{\"u}henb{\"u}hl, Philipp and Madan, Kanika and Doersch, Carl and Mordatch, Igor and Deepak Pathak},
booktitle={Conference on Robot Learning (CoRL)},
year={2025}
}
|
|
TAPVid-3D: A Benchmark for Tracking Any Point in 3D
Skanda Koppula, Ignacio Rocco, Yi Yang, Joe Heyward, Joao Carreira, Andrew Zisserman, Gabriel Brostow, Carl Doersch in NeurIPS 2024
@inproceedings{koppula2024tapvid,
title={TAPVid-3D: A Benchmark for Tracking Any Point in 3D},
author={Koppula, Skanda and Rocco, Ignacio and Yang, Yi and Heyward, Joe and Carreira, Joao and Zisserman, Andrew and Brostow, Gabriel and Doersch, Carl},
booktitle={Neural Information Processing Systems (NeurIPS)},
year={2024}
}
|
|
BootsTAP: Bootstrapped Training for Tracking-Any-Point
Carl Doersch, Pauline Luc, Yi Yang, Dilara Gokay, Skanda Koppula, Ankush Gupta, Joseph Heyward, Ignacio Rocco, Ross Goroshin, João Carreira, Andrew Zisserman in ACCV 2024
@inproceedings{doersch2024bootstap,
title={BootsTAP: Bootstrapped Training for Tracking-Any-Point},
author={Doersch, Carl and Luc, Pauline and Yang, Yi and Gokay, Dilara and Koppula, Skanda and Gupta, Ankush and Heyward, Joseph and Rocco, Ignacio and Goroshin, Ross and Carreira, Jo{\~a}o and Zisserman, Andrew},
booktitle={Asian Conference on Computer Vision (ACCV)},
year={2024}
}
|
|
RoboTAP: Tracking Arbitrary Points for Few-Shot Visual Imitation
Mel Vecerik, Carl Doersch, Yi Yang, Todor Davchev, Yusuf Aytar, Guangyao Zhou, Raia Hadsell, Lourdes Agapito, Jon Scholz
in ICRA 2024
@inproceedings{vecerik2024robotap,
title={RoboTAP: Tracking Arbitrary Points for Few-Shot Visual Imitation},
author={Vecerik, Mel and Doersch, Carl and Yang, Yi and Davchev, Todor and Aytar, Yusuf and Zhou, Guangyao and Hadsell, Raia and Agapito, Lourdes and Scholz, Jon},
booktitle={International Conference on Robotics and Automation (ICRA)},
year={2024}
}
|
|
TAPIR: Tracking Any Point with per-frame Initialization and temporal Refinement
Carl Doersch, Yi Yang, Mel Vecerik, Dilara Gokay, Ankush Gupta, Yusuf Aytar, João Carreira, Andrew Zisserman
in ICCV 2023
@inproceedings{doersch2023tapir,
title={TAPIR: Tracking Any Point with per-frame Initialization and temporal Refinement},
author={Doersch, Carl and Yang, Yi and Vecerik, Mel and Gokay, Dilara and Gupta, Ankush and Yusuf Aytar and Carreira, Jo{\~a}o and Zissserman, Andrew},
booktitle={International Conference on Computer Vision (ICCV)},
year={2023}
}
|
|
The Perception Test
Viorica Patraucean, Lucas Smaira, Ankush Gupta, Adria Recasens Continente, Larisa Markeeva, Dylan Banarse, Skanda Koppula, Joseph Heyward, Mateusz Malinowski, Yi Yang, Carl Doersch, Tatiana Matejovicova, Yury Sulsky, Antoine Miech, Alex Frechette, Hanna Klimczak, Raphael Koster, Junlin Zhang, Stephanie Winkler, Yusuf Aytar, Simon Osindero, Dima Damen, Andrew Zisserman, Joao Carreira
ECCV/ICCV Workshop Series
@article{patraucean2023perception,
title={The Perception Test: A Large-Scale Multimodal Benchmark for General Perception},
author={Patraucean, Viorica and Smaira, Lucas and Gupta, Ankush and Continente, Adri{\`a} Recasens and Markeeva, Larisa and Banarse, Dylan and Koppula, Skanda and Heyward, Joseph and Malinowski, Mateusz and Yang, Yi and Doersch, Carl and Matejovicova, Tatiana and Sulsky, Yury and Miech, Antoine and Frechette, Alex and Klimczak, Hanna and Koster, Raphael and Zhang, Junlin and Winkler, Stephanie and Aytar, Yusuf and Osindero, Simon and Damen, Dima and Zisserman, Andrew and Carreira, Jo{\~a}o},
journal={arXiv preprint arXiv:2305.13786},
year={2023}
}
|
|
TAP-Vid: A Benchmark for Tracking Any Point in a Video
Carl Doersch, Ankush Gupta, Larisa Markeeva, Adrià Recasens, Lucas Smaira, Yusuf Aytar, João Carreira, Andrew Zisserman, Yi Yang
in NeurIPS Datasets and Benchmarks 2022
@inproceedings{doersch2022tapvid,
title={TAP-Vid: A Benchmark for Tracking Any Point in a Video},
author={Doersch, Carl and Gupta, Ankush and Markeeva, Larisa and Recasens, Adri{\`a} and Smaira, Lucas and Aytar, Yusuf and Carreira, Jo{\~a}o and Zisserman, Andrew and Yang, Yi},
booktitle={NeurIPS Datasets and Benchmarks},
year={2022}
}
|
|
Input-level Inductive Biases for 3D Reconstruction
Wang Yifan, Carl Doersch, Relja Arandjelovic, Joao Carreira, Andrew Zisserman
in CVPR 2022
@inproceedings{yifan2022input,
title={Input-level Inductive Biases for 3D Reconstruction},
author={Yifan, Wang and Doersch, Carl and Arandjelovic, Relja and Carreira, Joao and Zisserman, Andrew},
booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2022}
}
|
|
Kubric: A Scalable Dataset Generator
Klaus Greff, Francois Belletti, Lucas Beyer, Carl Doersch, Yilun Du, Daniel Duckworth, David J. Fleet, Dan Gnanapragasam, Florian Golemo, Charles Herrmann, Thomas Kipf, Abhijit Kundu, Dmitry Lagun, Issam Laradji, Hsueh-Ti (Derek) Liu, Henning Meyer, Yishu Miao, Derek Nowrouzezahrai, Cengiz Oztireli, Etienne Pot, Noha Radwan, Daniel Rebain, Sara Sabour, Mehdi S. M. Sajjadi, Matan Sela, Vincent Sitzmann, Austin Stone, Deqing Sun, Suhani Vora, Ziyu Wang, Tianhao Wu, Kwang Moo Yi, Fangcheng Zhong, Andrea Tagliasacchi
in CVPR 2022
@inproceedings{greff2022kubric,
title={Kubric: A Scalable Dataset Generator},
author={Greff, Klaus and Belletti, Francois and Beyer, Lucas and Doersch, Carl and Du, Yilun and Duckworth, Daniel and Fleet, David J and Gnanapragasam, Dan and Golemo, Florian and Herrmann, Charles and Kipf, Thomas and Kundu, Abhijit and Lagun, Dmitry and Laradji, Issam and Liu, Hsueh-Ti Derek and Meyer, Henning and Miao, Yishu and Nowrouzezahrai, Derek and Oztireli, Cengiz and Pot, Etienne and Radwan, Noha and Rebain, Daniel and Sabour, Sara and Sajjadi, Mehdi SM and Sela, Matan and Sitzmann, Vincent and Stone, Austin and Sun, Deqing and Vora, Suhani and Wang, Ziyu and Wu, Tianhao and Yi, Kwang Moo and Zhong, Fangcheng and Tagliasacchi, Andrea},
booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2022}
}
|
|
Perceiver IO: A General Architecture for Structured Inputs & Outputs
Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Henaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, Joao Carreira
in ICLR 2022
@inproceedings{jaegle2022perceiver,
title={Perceiver IO: A General Architecture for Structured Inputs \& Outputs},
author={Jaegle, Andrew and Borgeaud, Sebastian and Alayrac, Jean-Baptiste and Doersch, Carl and Ionescu, Catalin and Ding, David and Koppula, Skanda and Zoran, Daniel and Brock, Andrew and Shelhamer, Evan and Henaff, Olivier and Botvinick, Matthew M and Zisserman, Andrew and Vinyals, Oriol and Carreira, Joao},
booktitle={International Conference on Learning Representations (ICLR)},
year={2022}
}
|
|
Inferring a Continuous Distribution of Atom Coordinates from Cryo-EM Images using VAEs
Dan Rosenbaum, Marta Garnelo, Michal Zielinski, Charlie Beattie, Ellen Clancy, Andrea Huber, Pushmeet Kohli, Andrew W. Senior, John Jumper, Carl Doersch, S. M. Ali Eslami, Olaf Ronneberger, Jonas Adler
in NeurIPS 2021 workshop on Machine Learning in Structural Biology
@inproceedings{rosenbaum2021inferring,
title={Inferring a Continuous Distribution of Atom Coordinates from Cryo-EM Images using VAEs},
author={Rosenbaum, Dan and Garnelo, Marta and Zielinski, Michal and Beattie, Charlie and Clancy, Ellen and Huber, Andrea and Kohli, Pushmeet and Senior, Andrew W and Jumper, John and Doersch, Carl and Eslami, SM Ali and Ronneberger, Olaf and Adler, Jonas},
booktitle={NeurIPS Workshop on Machine Learning in Structural Biology},
year={2021}
}
|
|
CrossTransformers: spatially-aware few-shot transfer
Carl Doersch, Ankush Gupta, Andrew Zisserman in NeurIPS 2020
@inproceedings{doersch2020crosstransformers,
title={CrossTransformers: spatially-aware few-shot transfer},
author={Doersch, Carl and Gupta, Ankush and Zisserman, Andrew},
booktitle={Neural Information Processing Systems (NeurIPS)},
year={2020}
}
|
|
Bootstrap Your Own Latent: A New Approach to Self-Supervised Learning
Jean-Bastien Grill, Florian Strub, Florent Altché, Corentin Tallec, Pierre H. Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Daniel Guo, Mohammad Gheshlaghi Azar, Bilal Piot, Koray Kavukcuoglu, Rémi Munos, Michal Valko in NeurIPS 2020 (Oral)
@inproceedings{grill2020bootstrap,
title={Bootstrap Your Own Latent: A New Approach to Self-Supervised Learning},
author={Grill, Jean-Bastien and Strub, Florian and Altch{\'e}, Florent and Tallec, Corentin and Richemond, Pierre H and Buchatskaya, Elena and Doersch, Carl and Avila Pires, Bernardo and Guo, Zhaohan Daniel and Gheshlaghi Azar, Mohammad and Piot, Bilal and Kavukcuoglu, Koray and Munos, R{\'e}mi and Valko, Michal},
booktitle={Neural Information Processing Systems (NeurIPS)},
year={2020}
}
|
|
Data-Efficient Image Recognition with Contrastive Predictive Coding
Olivier J. Hénaff, Aravind Srinivas, Jeffrey De Fauw, Ali Razavi, Carl Doersch, S. M. Ali Eslami, Aaron van den Oord in ICML 2020
@inproceedings{henaff2020data,
title={Data-Efficient Image Recognition with Contrastive Predictive Coding},
author={H{\'e}naff, Olivier J and Srinivas, Aravind and De Fauw, Jeffrey and Razavi, Ali and Doersch, Carl and Eslami, SM Ali and van den Oord, Aaron},
booktitle={International Conference on Machine Learning (ICML)},
year={2020}
}
|
|
Sim2real transfer learning for 3D human pose estimation: motion to the rescue
Carl Doersch, Andrew Zisserman in NeurIPS 2019
@inproceedings{doersch2019sim2real,
title={Sim2real transfer learning for 3D human pose estimation: motion to the rescue},
author={Doersch, Carl and Zisserman, Andrew},
booktitle={Neural Information Processing Systems (NeurIPS)},
year={2019}
}
|
|
Exploiting temporal context for 3D human pose estimation in the wild
Anurag Arnab, Carl Doersch, Andrew Zisserman in CVPR 2019
@inproceedings{arnab2019exploiting,
title={Exploiting temporal context for 3D human pose estimation in the wild},
author={Arnab, Anurag and Doersch, Carl and Zisserman, Andrew},
booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2019}
}
|
|
Video Action Transformer Network
Rohit Girdhar, João Carreira, Carl Doersch, Andrew Zisserman in CVPR 2019
@inproceedings{girdhar2019video,
title={Video Action Transformer Network},
author={Girdhar, Rohit and Carreira, Joao and Doersch, Carl and Zisserman, Andrew},
booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2019}
}
A Better Baseline for AVA
Rohit Girdhar, João Carreira, Carl Doersch, Andrew Zisserman in CVPR 2018 ActivityNet Workshop
@inproceedings{girdhar2018better,
title={A Better Baseline for AVA},
author={Girdhar, Rohit and Carreira, Joao and Doersch, Carl and Zisserman, Andrew},
booktitle={CVPR ActivityNet Workshop},
year={2018}
}
|
|
|
Kickstarting Deep Reinforcement Learning
Simon Schmitt, Jony Hudson, Augustin Zidek, Simon Osindero, Carl Doersch, Wojciech Czarnecki, Joel Leibo, Heinrich Kuttler, Andrew Zisserman, Karen Simonyan, Ali Eslami
in NIPS 2018 Reinforcement Learning Workshop
@article{schmitt2018kickstarting,
title={Kickstarting Deep Reinforcement Learning},
author={Schmitt, Simon and Hudson, Jony and Zidek, Augustin and Osindero, Simon and Doersch, Carl and Czarnecki, Wojciech and Leibo, Joel and Kuttler, Heinrich and Zisserman, Andrew and Simonyan, Karen and Eslami, Ali},
journal={arXiv preprint arXiv:1803.03835},
year={2018}
}
|
|
Learning Visual Question Answering by Bootstrapping Hard Attention
Mateusz Malinowski, Carl Doersch, Adam Santoro, Peter Battaglia
in ECCV 2018
@inproceedings{malinowski2018learning,
title={Learning Visual Question Answering by Bootstrapping Hard Attention},
author={Malinowski, Mateusz and Doersch, Carl and Santoro, Adam and Battaglia, Peter},
booktitle={European Conference on Computer Vision (ECCV)},
year={2018}
}
The Visual QA Devil in the Details: The Impact of Early Fusion and Batch Norm on CLEVR
Mateusz Malinowski, Carl Doersch
in ECCV 2018 Workshop on Shortcomings in Vision and Language
@inproceedings{malinowski2018visual,
title={The Visual QA Devil in the Details: The Impact of Early Fusion and Batch Norm on CLEVR},
author={Malinowski, Mateusz and Doersch, Carl},
booktitle={ECCV Workshop on Shortcomings in Vision and Language},
year={2018}
}
|
|
Multi-task Self-Supervised Visual Learning
Carl Doersch and Andrew Zisserman
in ICCV 2017
@inproceedings{doersch2017multitask,
title = {Multi-task Self-Supervised Visual Learning},
author = {Doersch, Carl and Zisserman, Andrew},
booktitle = {International Conference on Computer Vision},
year = {2017},
}
|
|
Supervision Beyond Manual Annotations for Learning Visual Representations
Carl Doersch.
Carnegie Mellon Thesis Dissertation
@phdthesis{doersch2016unsupervised,
title = {Supervision Beyond Manual Annotations for Learning Visual Representations},
author = {Doersch, Carl},
school = {Carnegie Mellon University},
year = {2016},
}
|
|
Tutorial on Variational Autoencoders
Carl Doersch.
Arxiv Tech Report, June 2016
@article{doersch2016tutorial,
title = {Tutorial on Variational Autoencoders},
author = {Doersch, Carl},
journal = {arXiv preprint arXiv:1606.05908},
year = {2016},
}
|
|
An Uncertain Future: Forecasting from Static Images using Variational Autoencoders
Jacob Walker, Carl Doersch, Abhinav Gupta, and Martial Hebert.
in ECCV 2016
@inproceedings{walker2016uncertain,
title = {An Uncertain Future: Forecasting from Static Images using Variational Autoencoders},
author = {Walker, Jacob and Doersch, Carl and Gupta, Abhinav and Hebert, Martial},
booktitle = {European Conference on Computer Vision},
year = {2016},
}
|
|
Data-dependent Initializations of Convolutional Neural Networks
Philipp Krähenbühl, Carl Doersch, Jeff Donahue, and Trevor Darrell.
ICLR, 2016
@inproceedings{krahenbuhl2016data,
title={Data-dependent Initializations of Convolutional Neural Networks},
author={Kr{\"a}henb{\"u}hl, Philipp and Doersch, Carl and Donahue, Jeff and Darrell, Trevor},
booktitle={International Conference on Learning Representations (ICLR)},
year={2016}
}
|
|
Unsupervised Visual Representation Learning by Context Prediction
Carl Doersch, Abhinav Gupta, and Alexei A. Efros.
in ICCV 2015 (oral)
@inproceedings{doersch2015unsupervised,
title = {Unsupervised Visual Representation Learning by Context Prediction},
author = {Doersch, Carl and Gupta, Abhinav and Efros, Alexei A.},
booktitle = {International Conference on Computer Vision},
year = {2015},
}
|
|
Context as Supervisory Signal: Discovering Objects with Predictable Context
Carl Doersch, Abhinav Gupta, and Alexei A. Efros.
In ECCV 2014
@inproceedings{doersch2014context,
title = {Context as Supervisory Signal: Discovering Objects with Predictable Context},
author = {Doersch, Carl and Gupta, Abhinav and Efros, Alexei A.},
booktitle = {European Conference on Computer Vision},
year = {2014},
}
|
|
Mid-Level Visual Element Discovery as Discriminative Mode Seeking
Carl Doersch, Abhinav Gupta, and Alexei A. Efros.
In NIPS 2013
@inproceedings{doersch2013mid,
title = {Mid-Level Visual Element Discovery as Discriminative Mode Seeking},
author = {Doersch, Carl and Gupta, Abhinav and Efros, Alexei A.},
booktitle = {Neural Information Processing Systems (NIPS)},
year = {2013},
}
|
|
What Makes Paris Look like Paris?
Carl Doersch, Saurabh Singh, Abhinav Gupta, Josef Sivic, and Alexei A. Efros.
In SIGGRAPH 2012 (oral) Republished on the cover of the CACM magazine Dec. 2015
@article{doersch2012what,
title = {What Makes Paris Look like Paris?},
author = {Carl Doersch and Saurabh Singh and Abhinav Gupta and Josef Sivic and Alexei A. Efros},
journal = {ACM Transactions on Graphics (SIGGRAPH)},
volume = {31},
number = {4},
year = {2012},
}
@article{doersch2015makes,
title={What makes Paris look like Paris?},
author={Doersch, Carl and Singh, Saurabh and Gupta, Abhinav and Sivic, Josef and Efros, Alexei A},
journal={Communications of the ACM},
volume={58},
number={12},
pages={103--110},
year={2015},
publisher={ACM}
}
|
|
Bounding the Probability of Error for High Precision Optical Character Recognition
Gary B. Huang, Andrew Kae, Carl Doersch, and Erik Learned-Miller.
In JMLR 2012
@article{huang2012bounding,
title={Bounding the Probability of Error for High Precision Optical Character Recognition},
author={Huang, G.B. and Kae, A. and Doersch, C. and Learned-Miller, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={363--387},
year={2012}
}
Improving state-of-the-art OCR through high-precision document-specific modeling.
Andrew Kae, Gary B. Huang, Carl Doersch, and Erik Learned-Miller.
In CVPR 2010
@INPROCEEDINGS{kae10improving,
author = {Andrew Kae and Gary B. Huang and Carl Doersch and Erik Learned-Miller},
title = {Improving state-of-the-art OCR through high-precision document-specific modeling.},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2010},
month = {Jun}
}
|
|
 |