From 7c54e112383dfc9d9a42e42e7c070a54323d1e14 Mon Sep 17 00:00:00 2001 From: lukas-heilgenbrunner <lukas.heiligenbrunner@gmail.com> Date: Mon, 13 Jan 2025 22:36:44 +0100 Subject: [PATCH] add sgva clip to not used materials --- materialandmethods.typ | 20 +++++++++++++++++--- sources.bib | 10 ++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/materialandmethods.typ b/materialandmethods.typ index 425d27c..500745d 100644 --- a/materialandmethods.typ +++ b/materialandmethods.typ @@ -374,21 +374,35 @@ Its use of frozen pre-trained feature extractors is key to avoiding overfitting == Alternative Methods There are several alternative methods to few-shot learning which are not used in this bachelor thesis. -Either they performed worse on benchmarks compared to the used methods or they were released after my literature research. -#todo[Do it!] +Either they performed worse on benchmarks compared to the used methods or they were released after my initial literature research. -=== SgVA-CLIP +=== SgVA-CLIP (Semantic-guided Visual Adapting CLIP) // https://arxiv.org/pdf/2211.16191v2 // https://arxiv.org/abs/2211.16191v2 +SgVA-CLIP (Semantic-guided Visual Adapting CLIP) is a framework that improves few-shot learning by adapting pre-trained vision-language models like CLIP. +It focuses on generating better visual features for specific tasks while still using the general knowledge from the pre-trained model. +Instead of only aligning images and text, SgVA-CLIP includes a special visual adapting layer that makes the visual features more discriminative for the given task. +This process is supported by knowledge distillation, where detailed information from the pre-trained model guides the learning of the new visual features. 
+Additionally, the model uses contrastive losses to further refine both the visual and textual representations.~#cite(<peng2023sgvaclipsemanticguidedvisualadapting>) + +One advantage of SgVA-CLIP is that it can work well with very few labeled samples, making it suitable for applications like anomaly detection. +The use of pre-trained knowledge helps reduce the need for large datasets. +However, a disadvantage is that it depends heavily on the quality and capabilities of the pre-trained model. +If the pre-trained model lacks relevant information for the task, SgVA-CLIP might struggle to adapt. +This might rule SgVA-CLIP out for anomaly detection tasks because the images in such tasks are often very task-specific and not covered by general pre-trained models. +Also, fine-tuning the model can require considerable computational resources, which might be a limitation in some cases.~#cite(<peng2023sgvaclipsemanticguidedvisualadapting>) + === TRIDENT // https://arxiv.org/pdf/2208.10559v1 // https://arxiv.org/abs/2208.10559v1 +=== SOT // https://arxiv.org/pdf/2204.03065v1 // https://arxiv.org/abs/2204.03065v1 // anomaly detect +=== GLASS // https://arxiv.org/pdf/2407.09359v1 // https://arxiv.org/abs/2407.09359v1 diff --git a/sources.bib b/sources.bib index 8dc1f69..08707bf 100644 --- a/sources.bib +++ b/sources.bib @@ -137,3 +137,13 @@ primaryClass={cs.CV}, url={https://arxiv.org/abs/2204.07305}, } + +@misc{peng2023sgvaclipsemanticguidedvisualadapting, + title={SgVA-CLIP: Semantic-guided Visual Adapting of Vision-Language Models for Few-shot Image Classification}, + author={Fang Peng and Xiaoshan Yang and Linhui Xiao and Yaowei Wang and Changsheng Xu}, + year={2023}, + eprint={2211.16191}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2211.16191}, +}