File: compare-embeddings-logits.sh

package info (click to toggle)
llama.cpp 7593+dfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 71,012 kB
  • sloc: cpp: 329,391; ansic: 48,249; python: 32,103; lisp: 10,053; sh: 6,070; objc: 1,349; javascript: 924; xml: 384; makefile: 233
file content (84 lines) | stat: -rwxr-xr-x 2,217 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env bash

# Compare embeddings between the original model and its llama.cpp conversion
# by running scripts/utils/semantic_check.py on two binary embedding dumps.
# Usage: compare-embeddings-logits.sh [MODEL_PATH] [MODEL_NAME] [--prompts-file|-pf FILE]
# Reads llama.cpp embedding JSON on stdin when piped; otherwise uses a
# pre-generated data/llamacpp-*-embeddings.bin file.
# Env: EMBEDDING_MODEL_PATH, CONVERTED_EMBEDDING_PATH / CONVERTED_EMBEDDING_MODEL
# provide defaults for the model paths.
set -e

# Parse command line arguments.
# Globals (written): MODEL_PATH, MODEL_NAME, PROMPTS_FILE
# Arguments: an optional leading positional model path, an optional bare-word
#            model name, and --prompts-file|-pf FILE.
parse_args() {
    MODEL_PATH=""
    MODEL_NAME=""
    PROMPTS_FILE=""

    # First argument is always the model path (unless it is a flag)
    if [ $# -gt 0 ] && [[ "$1" != --* ]]; then
        MODEL_PATH="$1"
        shift
    fi

    # Parse remaining arguments
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --prompts-file|-pf)
                # Guard against a missing value: 'shift 2' with only one
                # argument left fails silently under 'set -e' (and would
                # loop forever without it).
                if [ $# -lt 2 ]; then
                    echo "error: $1 requires a file argument" >&2
                    exit 1
                fi
                PROMPTS_FILE="$2"
                shift 2
                ;;
            *)
                # If MODEL_NAME not set and this isn't a flag, use as model name
                if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then
                    MODEL_NAME="$1"
                fi
                shift
                ;;
        esac
    done
}

parse_args "$@"

# Fall back to environment-provided defaults for anything not given on the
# command line ('empty' and 'unset' are treated the same, as before).
if [ -z "$MODEL_PATH" ]; then
    MODEL_PATH="$EMBEDDING_MODEL_PATH"
fi
if [ -z "$MODEL_NAME" ]; then
    MODEL_NAME=$(basename "$MODEL_PATH")
fi

# Converted (GGUF) model: prefer CONVERTED_EMBEDDING_PATH, then
# CONVERTED_EMBEDDING_MODEL; derive the name by stripping the .gguf suffix.
CONVERTED_MODEL_PATH="$CONVERTED_EMBEDDING_PATH"
if [ -z "$CONVERTED_MODEL_PATH" ]; then
    CONVERTED_MODEL_PATH="$CONVERTED_EMBEDDING_MODEL"
fi
if [ -z "$CONVERTED_MODEL_NAME" ]; then
    CONVERTED_MODEL_NAME=$(basename "$CONVERTED_MODEL_PATH" .gguf)
fi

# Decide where the llama.cpp embeddings come from: a pre-generated binary
# when running interactively, or JSON piped on stdin.
if [ -t 0 ]; then
    CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
else
    # Process piped JSON data and convert to binary (matching logits.cpp format)
    TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
    # Install the cleanup trap BEFORE the conversion runs, so the temp file
    # is removed even if python3 fails under 'set -e'. Single quotes defer
    # expansion to trap time; the path itself is quoted.
    trap 'rm -f "$TEMP_FILE"' EXIT
    # The output path is passed as argv[1] instead of interpolating the
    # shell variable into the Python source (avoids quoting/injection bugs).
    python3 -c "
import json
import sys
import struct

data = json.load(sys.stdin)

# Flatten all embeddings completely
flattened = []
for item in data:
    embedding = item['embedding']
    for token_embedding in embedding:
        flattened.extend(token_embedding)

print(f'Total embedding values: {len(flattened)}', file=sys.stderr)

# Write as binary floats - matches logits.cpp fwrite format
with open(sys.argv[1], 'wb') as f:
    for value in flattened:
        f.write(struct.pack('f', value))
" "$TEMP_FILE"
    CPP_EMBEDDINGS="$TEMP_FILE"
fi

# Build the semantic_check.py invocation as an array so that paths containing
# spaces or shell metacharacters are passed through intact. The previous
# string-plus-eval approach word-split and re-evaluated every unquoted
# expansion (MODEL_PATH, CPP_EMBEDDINGS, ...).
SEMANTIC_CMD=(
    python scripts/utils/semantic_check.py
    --model-path "$MODEL_PATH"
    --python-embeddings "data/pytorch-${MODEL_NAME}-embeddings.bin"
    --cpp-embeddings "$CPP_EMBEDDINGS"
)

# Add prompts file if specified, otherwise use default prompt
if [ -n "$PROMPTS_FILE" ]; then
    SEMANTIC_CMD+=(--prompts-file "$PROMPTS_FILE")
else
    SEMANTIC_CMD+=(--prompt "Hello world today")
fi

# Execute the command
"${SEMANTIC_CMD[@]}"