Is there a way to do the sampling externally, not inside llama.cpp itself? I want to play around with stupid sampling strategies, but the code below results in very low generation speed.
import asyncio
import math

import httpx

client_main = httpx.AsyncClient()
client_unslop = httpx.AsyncClient()
last_response = None
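# NOTE: the original snippet calls softmax() without defining it; this is a
# minimal stand-in that turns a {token: logprob} dict into {token: probability},
# preserving input order (the greedy pick further down relies on the server
# returning top_logprobs sorted by logprob).
def softmax(logprobs):
    m = max(logprobs.values())
    exps = {tok: math.exp(lp - m) for tok, lp in logprobs.items()}
    total = sum(exps.values())
    return {tok: e / total for tok, e in exps.items()}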
async def get_logits(prompt, client, num_logits=100, tokens=1, endpoint="http://localhost:8080/completion"):
    # Ask the llama.cpp server for one greedy token plus the top-n logprobs.
    data = {
        "prompt": prompt,
        "max_tokens": tokens,
        "temperature": 0,
        "n_probs": num_logits,
        "min_keep": num_logits,
    }
    response = await client.post(endpoint, json=data)
    response = response.json()
    global last_response
    last_response = response  # keep the raw response around for debugging
    text, probs = response["content"], response["completion_probabilities"]
    return text, probs
async def sample_sequence(prompt="Once upon a time", num_tokens=10, top_logits=100, endpoint="http://localhost:8080/completion"):
    # Re-query the server once per generated token and do the sampling here.
    for _ in range(num_tokens):
        _, probs = await get_logits(prompt, client_main, num_logits=top_logits, endpoint=endpoint)
        # completion_probabilities[0].top_logprobs holds the top-n candidates for the first generated token.
        probs = softmax({cand["token"]: cand["logprob"] for cand in probs[0]["top_logprobs"]})
        sampled = list(probs.keys())[0]  # greedy: first key is the highest-probability candidate
        prompt += sampled
        yield sampled
async def main():
    async for result in sample_sequence(prompt="Here is a proof that", endpoint="http://localhost:8080/completion", num_tokens=500):
        print(result, end="", flush=True)

asyncio.run(main())
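For context on where a custom strategy would plug in: the only line doing any "sampling" above is the greedy pick of the first key. A sketch of swapping it for something else, e.g. drawing from the returned distribution at an arbitrary temperature (sample_with_temperature and the temperature value here are my own illustration, not part of the original code):

import random

def sample_with_temperature(probs, temperature=1.5):
    # probs: {token: probability} as produced by softmax() above.
    # Re-weight and draw one token; temperature > 1 flattens the distribution.
    weights = [p ** (1.0 / temperature) for p in probs.values()]
    return random.choices(list(probs.keys()), weights=weights, k=1)[0]

# Inside sample_sequence(), replace
#     sampled = list(probs.keys())[0]
# with
#     sampled = sample_with_temperature(probs)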