## Installing corresponding python packages & downloading model weights
# !pip install -r qwen_requirements.txt
# !cd model_weights
# !git lfs install
# !git clone https://hf-mirror.com/Qwen/Qwen-VL-Chat
preparing work
base vqa model
from transformers import AutoModelForCausalLM, AutoTokenizer
# Loading model from hugging face
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="cuda", trust_remote_code=True).eval()
# Loading model from local path.
# The checkpoint ships custom modeling/tokenizer code, hence trust_remote_code=True.
model_path = '/model_weights/Qwen-VL-Chat'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# .eval() switches the model to inference mode.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cuda", trust_remote_code=True).eval()
/home/ll_25113060022/anaconda3/envs/undergraduates_qwenvl/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
/home/ll_25113060022/anaconda3/envs/undergraduates_qwenvl/lib/python3.10/site-packages/transformers/utils/generic.py:260: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
torch.utils._pytree._register_pytree_node(
/home/ll_25113060022/anaconda3/envs/undergraduates_qwenvl/lib/python3.10/site-packages/transformers/utils/generic.py:260: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
torch.utils._pytree._register_pytree_node(
The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Loading checkpoint shards: 100%|██████████| 10/10 [00:18<00:00, 1.90s/it]
from PIL import Image
import pandas as pd
# Directory holding the images and CSV listing (img_path, question, answer) rows.
img_path_head = '/cpfs01/projects-HDD/cfff-906dc71fafda_HDD/ll_25113060022/undergraduates_course_copy/hallucination/dataset/img_jpgs/'
dataset_csv_path = '/cpfs01/projects-HDD/cfff-906dc71fafda_HDD/ll_25113060022/undergraduates_course_copy/hallucination/dataset/vqa_info.csv'
df = pd.read_csv(dataset_csv_path)
# Notebook display: preview the first 10 rows.
df.iloc[:10,]
img_path | question | answer | |
---|---|---|---|
0 | img_0.jpg | is there evidence of an aortic aneurysm? | yes |
1 | img_1.jpg | is there airspace consolidation on the left side? | yes |
2 | img_10.jpg | is there any intraparenchymal abnormalities in... | no |
3 | img_100.jpg | which side of the heart border is obscured? | right |
4 | img_101.jpg | where are the kidney? | not seen here |
5 | img_102.jpg | are the kidneys present in this image? | no |
6 | img_103.jpg | is the colon more prominent on the patient's r... | left |
7 | img_104.jpg | where is the colon most prominent from this view? | left |
8 | img_105.jpg | is the heart size in this image smaller or lar... | smaller |
9 | img_106.jpg | is the liver visible in the image? | no |
# Row indices of the two dataset samples used in this demo.
index_1, index_2 = 262, 384

# Sample 1: image path, question/answer pair, and the prompt string.
image_path_1 = img_path_head+df['img_path'][index_1]
question_1, answer_1 = df['question'][index_1], df['answer'][index_1]
# Prompt layout: the image path wrapped in <img> tags, then the question and an "Answer:" cue.
question_1_ = '<img>{}</img>{} Answer:'.format(image_path_1, question_1)

# Pad on the left, using the tokenizer's eod id as the pad token.
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
input_ids_1 = tokenizer([question_1_], return_tensors='pt', padding='longest')
#########################################################################################################################################
# Sample 2: image path, question/answer pair, and the prompt string.
image_path_2 = img_path_head+df['img_path'][index_2]
question_2, answer_2 = df['question'][index_2], df['answer'][index_2]
# Prompt layout: the image path wrapped in <img> tags, then the question and an "Answer:" cue.
question_2_ = '<img>{}</img>{} Answer:'.format(image_path_2, question_2)

# Pad on the left, using the tokenizer's eod id as the pad token.
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
input_ids_2 = tokenizer([question_2_], return_tensors='pt', padding='longest')
##########################################################################################################################################
import matplotlib.pyplot as plt

# Load both sample images as RGB.
image_1 = Image.open(image_path_1).convert("RGB")
image_2 = Image.open(image_path_2).convert("RGB")

# Show the two samples side by side, each titled with its question and reference answer.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
axes[0].imshow(image_1)
axes[0].set_title(f"image_1\nQuestion: {question_1}\nAnswer: {answer_1}")
axes[0].axis('off')
axes[1].imshow(image_2)
axes[1].set_title(f"image_2\nQuestion: {question_2}\nAnswer: {answer_2}")
axes[1].axis('off')

plt.tight_layout()
plt.show()
# Generate an answer for question 1 without sampling (do_sample=False), up to 20 new tokens.
outputs_1 = model.generate(
    input_ids=input_ids_1.input_ids.cuda(),
    attention_mask=input_ids_1.attention_mask.cuda(),
    do_sample=False,
    max_new_tokens=20,
    min_new_tokens=1,
    length_penalty=1,
    num_return_sequences=1,
    output_hidden_states=True,
    use_cache=True,
    pad_token_id=tokenizer.eod_id,
    eos_token_id=tokenizer.eod_id,
    temperature=1.0,
    top_p=1,
    top_k=None,
    output_scores=True,  # key: enable score output (used by the metrics below)
    return_dict_in_generate=True,  # key: return a structured result (.sequences, .scores)
)
# Drop the prompt tokens from each returned sequence, decode the remainder, keep the first.
predicting_answer_1 = [
    tokenizer.decode(seq[input_ids_1.input_ids.size(1):].cpu(),
                     skip_special_tokens=True).strip() for seq in outputs_1.sequences
][0].strip()
print(f"{'*'*100}\nQuestion1: {question_1}\nAnswer1: {answer_1}\nPredict1: {predicting_answer_1}")
#############################################################################################
# Generate an answer for question 2, up to 20 new tokens.
# NOTE(review): do_sample=True here, whereas the question-1 call uses do_sample=False —
# confirm the difference is intended.
outputs_2 = model.generate(
    input_ids=input_ids_2.input_ids.cuda(),
    attention_mask=input_ids_2.attention_mask.cuda(),
    do_sample=True,
    max_new_tokens=20,
    min_new_tokens=1,
    length_penalty=1,
    num_return_sequences=1,
    output_hidden_states=True,
    use_cache=True,
    pad_token_id=tokenizer.eod_id,
    eos_token_id=tokenizer.eod_id,
    temperature=1.0,
    top_p=1,
    top_k=None,
    output_scores=True,  # key: enable score output (used by the metrics below)
    return_dict_in_generate=True,  # key: return a structured result (.sequences, .scores)
)
# Drop the prompt tokens from each returned sequence, decode the remainder, keep the first.
predicting_answer_2 = [
    tokenizer.decode(seq[input_ids_2.input_ids.size(1):].cpu(),
                     skip_special_tokens=True).strip() for seq in outputs_2.sequences
][0].strip()
print(f"{'*'*100}\nQuestion2: {question_2}\nAnswer2: {answer_2}\nPredict2: {predicting_answer_2}")
/home/ll_25113060022/anaconda3/envs/undergraduates_qwenvl/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:377: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `None` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.
warnings.warn(
****************************************************************************************************
Question1: which ventricle is the mass located in?
Answer1: 4th ventricle
Predict1: fourth ventricle
****************************************************************************************************
Question2: this image is consistent with what condition?
Answer2: appendicitis
Predict2: hydronephrosis
hallucination detection
import torch
import numpy as np
def softmax_1(x):
    """Return the row-wise softmax of a 2-D array.

    Args:
        x: 2-D numpy array; the softmax is taken along axis 1.

    Returns:
        Array of the same shape as ``x`` whose rows each sum to 1.
    """
    # Avoid numerical overflow: subtract the per-row maximum before exponentiating.
    x_max = np.max(x, axis=1, keepdims=True)
    x_exp = np.exp(x - x_max)

    # Softmax denominator: per-row sum of the exponentials.
    sum_exp = np.sum(x_exp, axis=1, keepdims=True)

    return x_exp / sum_exp
def compute_entropy(probabilities):
    """Compute the Shannon entropy (natural log) of each row of a probability matrix.

    Args:
        probabilities: 2-D numpy array; each row is a probability distribution.

    Returns:
        Numpy array of shape (n, 1) holding the entropy of each of the n rows.
    """
    # Take log only where p > 0; zero entries keep log_probs == 0, which avoids
    # log(0) producing -inf/NaN and makes them contribute nothing to the sum.
    log_probs = np.zeros_like(probabilities)
    non_zero_mask = probabilities > 0
    log_probs[non_zero_mask] = np.log(probabilities[non_zero_mask])

    # Per-row entropy: H = -sum(p_i * log(p_i))
    entropy_per_row = -np.sum(probabilities * log_probs, axis=1, keepdims=True)

    return entropy_per_row
# Concatenate the per-step scores from generation into one array (one row per step).
logits = torch.cat(outputs_1.scores, dim=0).float().cpu().detach().numpy()
prob = softmax_1(logits)
# Highest probability at each step, then its mean and max over the steps.
prob_max = np.max(prob, axis=1)
average_prob = np.mean(prob_max, axis=0)
maximum_prob = np.max(prob_max, axis=0)

# Entropy of each step's distribution, then its mean and max over the steps.
entropy = compute_entropy(prob)
average_entropy = np.mean(entropy, axis=0)[0]
maximum_entropy = np.max(entropy, axis=0)[0]

print(f"{'*'*50}\nhallucination metirc of outputs_1:")
print(f"average probability = {average_prob}\nmaximum probability = {maximum_prob}")
print(f"average entropy = {average_entropy}\nmaximum entropy = {maximum_entropy}")
##############################################################################################
# Concatenate the per-step scores from generation into one array (one row per step).
logits = torch.cat(outputs_2.scores, dim=0).float().cpu().detach().numpy()
prob = softmax_1(logits)
# Highest probability at each step, then its mean and max over the steps.
prob_max = np.max(prob, axis=1)
average_prob = np.mean(prob_max, axis=0)
maximum_prob = np.max(prob_max, axis=0)

# Entropy of each step's distribution, then its mean and max over the steps.
entropy = compute_entropy(prob)
average_entropy = np.mean(entropy, axis=0)[0]
maximum_entropy = np.max(entropy, axis=0)[0]

print(f"{'*'*50}\nhallucination metirc of outputs_2:")
print(f"average probability = {average_prob}\nmaximum probability = {maximum_prob}")
print(f"average entropy = {average_entropy}\nmaximum entropy = {maximum_entropy}")
**************************************************
hallucination metirc of outputs_1:
average probability = 0.8835902214050293
maximum probability = 0.9999891519546509
average entropy = 0.4122808873653412
maximum entropy = 1.906251072883606
**************************************************
hallucination metirc of outputs_2:
average probability = 0.811636745929718
maximum probability = 0.999339759349823
average entropy = 1.2919089794158936
maximum entropy = 6.320765495300293
Visual contrastive decoding for hallucination mitigation
Reference: Leng, et al. Mitigating Object Hallucinations in Large Vision-Language Models through Visual Contrastive Decoding. CVPR, 2024.
import sys

# Add the local Qwen-VL and VCD helper directories to the import search path.
sys.path.append('/cpfs01/projects-HDD/cfff-906dc71fafda_HDD/ll_25113060022/undergraduates_course_copy/hallucination/advanced/Qwen_VL')
sys.path.append('/cpfs01/projects-HDD/cfff-906dc71fafda_HDD/ll_25113060022/undergraduates_course_copy/hallucination/advanced/vcd_utils')

from transformers import AutoTokenizer
from Qwen_VL.modeling_qwen import QWenLMHeadModel
from vcd_utils.vcd_add_noise import add_diffusion_noise
from vcd_utils.vcd_sample import evolve_vcd_sampling

# Called once before loading the model.
# NOTE(review): presumably installs the VCD sampling routine into generation — confirm in vcd_utils.
evolve_vcd_sampling()

model_path = '/model_weights/Qwen-VL-Chat'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Loaded through the local QWenLMHeadModel class rather than AutoModelForCausalLM.
# transformers warns that trust_remote_code is ignored for non-Auto classes.
model_vcd = QWenLMHeadModel.from_pretrained(
    model_path,
    device_map="cuda",
    trust_remote_code=True
).eval()
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Loading checkpoint shards: 100%|██████████| 10/10 [00:15<00:00, 1.52s/it]
# Row index of the dataset sample used for the VCD comparison.
index_3 = 7

image_path = img_path_head+df['img_path'][index_3]
image = Image.open(image_path).convert("RGB")
# Apply the base model's visual transform, add a batch dimension, move to the model's device.
image_tensor = model.transformer.visual.image_transform(image).unsqueeze(0).to(model.device)
# Noised copy of the image tensor, passed to generation as the contrastive input (images_cd).
image_tensor_cd = add_diffusion_noise(image_tensor, noise_step=500)

question, answer = df['question'][index_3], df['answer'][index_3]
# Prompt layout: the image path wrapped in <img> tags, then the question and an "Answer:" cue.
question_ = '<img>{}</img>{} Answer:'.format(image_path, question)

# Pad on the left, using the tokenizer's eod id as the pad token.
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
input_ids = tokenizer([question_], return_tensors='pt', padding='longest')
##########################################################################################################################################
import matplotlib.pyplot as plt

# Show the sample image titled with its question and reference answer.
fig, axes = plt.subplots(1, 1, figsize=(7, 6))

axes.imshow(image)
axes.set_title(f"Question: {question}\nAnswer: {answer}")
axes.axis('off')

plt.tight_layout()
plt.show()
# Baseline: generate with the base model (sampling enabled), up to 20 new tokens.
outputs = model.generate(
    input_ids=input_ids.input_ids.cuda(),
    attention_mask=input_ids.attention_mask.cuda(),
    do_sample=True,
    max_new_tokens=20,
    min_new_tokens=1,
    length_penalty=1,
    num_return_sequences=1,
    output_hidden_states=True,
    use_cache=True,
    pad_token_id=tokenizer.eod_id,
    eos_token_id=tokenizer.eod_id,
    temperature=1.0,
    top_p=1,
    top_k=None,
    output_scores=True,  # key: enable score output (used by the metrics below)
    return_dict_in_generate=True,  # key: return a structured result (.sequences, .scores)
)
# Drop the prompt tokens from each returned sequence, decode the remainder, keep the first.
predicting_answer = [
    tokenizer.decode(seq[input_ids.input_ids.size(1):].cpu(),
                     skip_special_tokens=True).strip() for seq in outputs.sequences
][0].strip()
# VCD: generate with the locally loaded model, passing both the clean image tensor
# (images) and its noised copy (images_cd) together with the cd_alpha / cd_beta settings.
outputs_vcd = model_vcd.generate(
    input_ids=input_ids.input_ids.cuda(),
    attention_mask=input_ids.attention_mask.cuda(),
    do_sample=True,
    max_new_tokens=20,
    min_new_tokens=1,
    length_penalty=1,
    num_return_sequences=1,
    output_hidden_states=True,
    use_cache=True,
    pad_token_id=tokenizer.eod_id,
    eos_token_id=tokenizer.eod_id,
    temperature=1.0,
    top_p=1,
    top_k=None,
    images=image_tensor,
    images_cd=image_tensor_cd,
    cd_beta=0.1,
    cd_alpha=1,
    output_scores=True,  # key: enable score output (used by the metrics below)
    return_dict_in_generate=True,  # key: return a structured result (.sequences, .scores)
)
# Drop the prompt tokens from each returned sequence, decode the remainder, keep the first.
predicting_answer_cd = [
    tokenizer.decode(seq[input_ids.input_ids.size(1):].cpu(),
                     skip_special_tokens=True).strip() for seq in outputs_vcd.sequences
][0].strip()
print(f"{'*'*100}\nQuestion: {question}\nAnswer: {answer}\nPredict: {predicting_answer}\nPredict_cvd: {predicting_answer_cd}")
****************************************************************************************************
Question: where is the colon most prominent from this view?
Answer: left
Predict: right
Predict_cvd: left
# Concatenate the per-step scores from the baseline generation (one row per step).
logits = torch.cat(outputs.scores, dim=0).float().cpu().detach().numpy()
prob = softmax_1(logits)
# Highest probability at each step, then its mean and max over the steps.
prob_max = np.max(prob, axis=1)
average_prob = np.mean(prob_max, axis=0)
maximum_prob = np.max(prob_max, axis=0)

# Entropy of each step's distribution, then its mean and max over the steps.
entropy = compute_entropy(prob)
average_entropy = np.mean(entropy, axis=0)[0]
maximum_entropy = np.max(entropy, axis=0)[0]

print(f"{'*'*50}\nhallucination metirc of outputs:")
print(f"average probability = {average_prob}\nmaximum probability = {maximum_prob}")
print(f"average entropy = {average_entropy}\nmaximum entropy = {maximum_entropy}")
###########################################################################################
# Concatenate the per-step scores from the VCD generation (one row per step).
logits = torch.cat(outputs_vcd.scores, dim=0).float().cpu().detach().numpy()
prob = softmax_1(logits)
# Highest probability at each step, then its mean and max over the steps.
prob_max = np.max(prob, axis=1)
average_prob = np.mean(prob_max, axis=0)
maximum_prob = np.max(prob_max, axis=0)

# Entropy of each step's distribution, then its mean and max over the steps.
entropy = compute_entropy(prob)
average_entropy = np.mean(entropy, axis=0)[0]
maximum_entropy = np.max(entropy, axis=0)[0]

print(f"{'*'*50}\nhallucination metirc of outputs_cd:")
print(f"average probability = {average_prob}\nmaximum probability = {maximum_prob}")
print(f"average entropy = {average_entropy}\nmaximum entropy = {maximum_entropy}")
**************************************************
hallucination metirc of outputs:
average probability = 0.7589143514633179
maximum probability = 0.8660995364189148
average entropy = 1.211894154548645
maximum entropy = 1.8057494163513184
**************************************************
hallucination metirc of outputs_cd:
average probability = 0.969956636428833
maximum probability = 1.0
average entropy = 0.11360295116901398
maximum entropy = 0.22720590233802795
Reuse
Citation
For attribution, please cite this work as:
Li, Zeju. n.d. “Preparing Work.” https://zerojumpline.github.io//teaching/2025-08-15-Undergraduate
Project/hallucination.html.