rexoscare/autocomplete-search-dataset
Viewer • Updated • 75.2k • 34
How to use lv12/sin-qac-model with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-classification", model="lv12/sin-qac-model", trust_remote_code=True)
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("lv12/sin-qac-model", trust_remote_code=True, dtype="auto")
A CNN+Transformer model for ranking query auto-completion suggestions.
This model scores how well a candidate completion matches a given query prefix.
It uses pretrained ByT5 byte-level embeddings for robust character-level understanding.
pip install transformers torch
from transformers import AutoTokenizer, AutoConfig, AutoModel
# The architecture is custom, so loading both the config and the weights
# requires trust_remote_code.
MODEL_ID = "lv12/sin-qac-model"
config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)
# Inputs are tokenized with the ByT5 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
import torch
def score_completion(model, tokenizer, prefix: str, candidate: str, max_length: int = 20) -> float:
    """Score how well a candidate completion matches a query prefix.

    Args:
        model: Scoring model. It is switched to eval mode and then called
            with the keyword tensors ``prefix_ids`` and ``candidate_ids``.
        tokenizer: Callable tokenizer whose result exposes ``"input_ids"``.
        prefix: The partial query typed so far.
        candidate: The full completion to score against ``prefix``.
        max_length: Length both inputs are padded or truncated to.

    Returns:
        The model output collapsed to a single Python float.
    """
    model.eval()

    # Prefix and candidate must be encoded with identical settings.
    encode_kwargs = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    prefix_ids = tokenizer(prefix, **encode_kwargs)["input_ids"]
    candidate_ids = tokenizer(candidate, **encode_kwargs)["input_ids"]

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        score = model(prefix_ids=prefix_ids, candidate_ids=candidate_ids)

    return score.squeeze().item()
# Example usage: rank a few candidate completions for one prefix.
prefix = "how to"
candidates = ["how to cook pasta", "how to learn python", "weather today"]
scores = [
    (candidate, score_completion(model, tokenizer, prefix, candidate))
    for candidate in candidates
]
# Sort by score (higher is better match)
scores.sort(key=lambda x: x[1], reverse=True)
for candidate, score in scores:
    print(f"{score:.4f} - {candidate}")
def batch_score(
    model,
    tokenizer,
    prefix: str,
    candidates: list[str],
    max_length: int = 20,
) -> list[tuple[str, float]]:
    """Score several candidate completions against a single prefix.

    The prefix is tokenized once and the candidates are tokenized in one
    call; the model is then invoked once per candidate, reusing the same
    prefix tensor each time.

    Args:
        model: Scoring model. It is switched to eval mode and then called
            with the keyword tensors ``prefix_ids`` and ``candidate_ids``.
        tokenizer: Callable tokenizer whose result exposes ``"input_ids"``.
        prefix: The partial query typed so far.
        candidates: Completions to score against ``prefix``.
        max_length: Length all inputs are padded or truncated to.

    Returns:
        ``(candidate, score)`` pairs in the same order as ``candidates``;
        an empty list when ``candidates`` is empty.
    """
    model.eval()

    # Nothing to score: return early instead of tokenizing an empty batch.
    if not candidates:
        return []

    encode_kwargs = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    prefix_ids = tokenizer(prefix, **encode_kwargs)["input_ids"]
    candidate_ids = tokenizer(candidates, **encode_kwargs)["input_ids"]

    with torch.no_grad():
        # split(1) yields one row per candidate, keeping the leading batch
        # dimension of size 1 for each model call.
        scores = [
            model(prefix_ids=prefix_ids, candidate_ids=row).squeeze().item()
            for row in candidate_ids.split(1)
        ]

    return list(zip(candidates, scores))
# Example: score several completions for one prefix, best match first.
results = batch_score(model, tokenizer, "best resta", [
    "best restaurants near me",
    "best restaurant in new york",
    "best resume templates",
    "weather forecast",
])
for candidate, score in sorted(results, key=lambda x: x[1], reverse=True):
    print(f"{score:.4f} - {candidate}")
final.ckpt
The model outputs scores between 0 and 1.
If you use this model, please cite:
@misc{query-completion-model,
title={Query Auto-Completion Model},
year={2024},
publisher={HuggingFace},
url={https://huggingface.co/lv12/sin-qac-model}
}
Base model
google/byt5-small