% Conference paper: "VTT: Long-term Visual Tracking with Transformers"
% (International Conference on Pattern Recognition, 2021).
% Acronyms are brace-protected so sentence-casing styles keep their capitals.
% The day and language fields are not used by the standard BibTeX styles;
% they are kept as inert metadata.
% NOTE(review): address holds a country, not a publisher city -- confirm
% against the publisher record before relying on it in printed output.
@inproceedings{d32ea62114594bbdb4802672872715c4,
  author    = {Bian, Tianling and Hua, Yang and Song, Tao and Xue, Zhengui and Ma, Ruhui and Robertson, Neil and Guan, Haibing},
  title     = {{VTT}: Long-term Visual Tracking with Transformers},
  booktitle = {International Conference on Pattern Recognition ({ICPR})},
  series    = {International Conference on Pattern Recognition ({ICPR}): Proceedings},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2021},
  month     = may,
  day       = {5},
  isbn      = {978-1-7281-8809-6},
  language  = {English},
  abstract  = {Long-term visual tracking is a challenging problem. State-of-the-art long-term trackers, e.g., GlobalTrack, utilize region proposal networks (RPNs) to generate target proposals. However, the performance of the trackers is affected by occlusions and large scale or ratio variations. To address these issues, in this paper, we are the first to propose a novel architecture with transformers for long-term visual tracking. Specifically, the proposed Visual Tracking Transformer (VTT) utilizes a transformer encoder-decoder architecture for aggregating global information to deal with occlusion and large scale or ratio variation. Furthermore, it also shows better discriminative power against instance-level distractors without the need for extra labeling and hard-sample mining. We conduct extensive experiments on three large-scale long-term tracking datasets and have achieved state-of-the-art performance.},
}