Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models
Paper
•
2506.05176
•
Published
•
78
This is an INT8 dynamically quantized ONNX version of Qwen/Qwen3-Reranker-0.6B.
Qwen3-Reranker-0.6B is a text reranking model that scores the relevance between a query and a document. This quantized version provides faster inference with reduced model size while maintaining comparable accuracy.
pip install optimum[onnxruntime] transformers
import torch
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer
# Load model and tokenizer
# use_cache=False disables past-key-value caching — presumably the INT8 ONNX
# export was created without KV-cache inputs/outputs; TODO confirm against the
# quantization script.
model = ORTModelForCausalLM.from_pretrained("thomasht86/Qwen3-Reranker-0.6B-int8-ONNX", use_cache=False)
tokenizer = AutoTokenizer.from_pretrained("thomasht86/Qwen3-Reranker-0.6B-int8-ONNX", fix_mistral_regex=True)
# Format input for reranking
# The system prompt restricts the model's answer to "yes"/"no", so relevance
# can be read directly off the yes/no token logits (see get_relevance_score).
SYSTEM_PROMPT = 'Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".'
def get_relevance_score(query: str, document: str, instruction: str | None = None) -> float:
    """Score the relevance of *document* to *query* with the Qwen3 reranker.

    Args:
        query: The search query.
        document: The candidate passage to score against the query.
        instruction: Optional task instruction. Defaults to the generic
            web-search retrieval instruction.

    Returns:
        The probability (0.0-1.0) that the model answers "yes", i.e. that
        the document is judged relevant to the query.
    """
    if instruction is None:
        instruction = "Given a web search query, retrieve relevant passages that answer the query"
    user_content = f"<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}"
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    # With enable_thinking=True the chat template leaves the think block to the
    # model, so an empty <think> section is appended manually to pin the
    # prompt into the "no reasoning, answer immediately" form.
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
    text += "<think>\n\n</think>\n\n"
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs, use_cache=False)
    # Logits for the next token after the prompt — the model's yes/no answer.
    logits = outputs.logits[:, -1, :]
    # Get yes/no token probabilities
    token_true_id = tokenizer.convert_tokens_to_ids("yes")
    token_false_id = tokenizer.convert_tokens_to_ids("no")
    # Softmax over just the [no, yes] pair; index 1 therefore means "yes".
    true_false_logits = logits[:, [token_false_id, token_true_id]]
    probs = torch.softmax(true_false_logits, dim=-1)
    return probs[:, 1].item()  # Probability of "yes" (relevant)
# Example: score a single query/document pair and print the result.
score = get_relevance_score(
    query="What is the capital of France?",
    document="Paris is the capital and largest city of France.",
)
print(f"Relevance score: {score:.4f}")
The quantization script used to create this model is included in this repository as quantize_qwen3_reranker.py.
To reproduce:
# Install uv if needed: pip install uv
uv run quantize_qwen3_reranker.py --output-dir ./output
This model inherits the license from the base model: Apache 2.0
@article{qwen3embedding,
title={Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},
author={Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and others},
journal={arXiv preprint arXiv:2506.05176},
year={2025}
}