File: system-file-format.texi

package info (click to toggle)
pspp 2.0.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 66,676 kB
  • sloc: ansic: 267,210; xml: 18,446; sh: 5,534; python: 2,881; makefile: 125; perl: 64
file content (1784 lines) | stat: -rw-r--r-- 57,360 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
@c PSPP - a program for statistical analysis.
@c Copyright (C) 2019 Free Software Foundation, Inc.
@c Permission is granted to copy, distribute and/or modify this document
@c under the terms of the GNU Free Documentation License, Version 1.3
@c or any later version published by the Free Software Foundation;
@c with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
@c A copy of the license is included in the section entitled "GNU
@c Free Documentation License".
@c

@node System File Format
@chapter System File Format

An SPSS system file holds a set of cases and dictionary information
that describes how they may be interpreted.  The system file format
dates back 40+ years and has evolved greatly over that time to support
new features, but in a way to facilitate interchange between even the
oldest and newest versions of software.  This chapter describes the
system file format.

System files use four data types: 8-bit characters, 32-bit integers,
64-bit integers,
and 64-bit floating points, called here @code{char}, @code{int32},
@code{int64}, and
@code{flt64}, respectively.  Data is not necessarily aligned on a word
or double-word boundary: the long variable name record (@pxref{Long
Variable Names Record}) and very long string records (@pxref{Very Long
String Record}) have arbitrary byte length and can therefore cause all
data coming after them in the file to be misaligned.

Integer data in system files may be big-endian or little-endian.  A
reader may detect the endianness of a system file by examining
@code{layout_code} in the file header record
(@pxref{layout_code,,@code{layout_code}}).

Floating-point data in system files may nominally be in IEEE 754, IBM,
or VAX formats.  A reader may detect the floating-point format in use
by examining @code{bias} in the file header record
(@pxref{bias,,@code{bias}}).

PSPP detects big-endian and little-endian integer formats in system
files and translates as necessary.  PSPP also detects the
floating-point format in use, as well as the endianness of IEEE 754
floating-point numbers, and translates as needed.  However, only IEEE
754 numbers with the same endianness as integer data in the same file
have actually been observed in system files, and it is likely that
other formats are obsolete or were never used.

System files use a few floating point values for special purposes:

@table @asis
@item SYSMIS
The system-missing value is represented by the largest possible
negative number in the floating point format (@code{-DBL_MAX}).

@item HIGHEST
HIGHEST is used as the high end of a missing value range with an
unbounded maximum.  It is represented by the largest possible positive
number (@code{DBL_MAX}).

@item LOWEST
LOWEST is used as the low end of a missing value range with an
unbounded minimum.  It was originally represented by the
second-largest negative number (in IEEE 754 format,
@code{0xffeffffffffffffe}).  System files written by SPSS 21 and later
instead use the largest negative number (@code{-DBL_MAX}), the same
value as SYSMIS.  This does not lead to ambiguity because LOWEST
appears in system files only in missing value ranges, which never
contain SYSMIS.
@end table

System files may use most character encodings based on an 8-bit unit.
UTF-16 and UTF-32, based on wider units, appear to be unacceptable.
@code{rec_type} in the file header record is sufficient to distinguish
between ASCII and EBCDIC based encodings.  The best way to determine
the specific encoding in use is to consult the character encoding
record (@pxref{Character Encoding Record}), if present, and failing
that the @code{character_code} in the machine integer info record
(@pxref{Machine Integer Info Record}).  The same encoding should be
used for the dictionary and the data in the file, although it is
possible to artificially synthesize files that use different encodings
(@pxref{Character Encoding Record}).

@menu
* System File Record Structure::
* File Header Record::
* Variable Record::
* Value Labels Records::
* Document Record::
* Machine Integer Info Record::
* Machine Floating-Point Info Record::
* Multiple Response Sets Records::
* Extra Product Info Record::
* Variable Display Parameter Record::
* Variable Sets Record::
* Long Variable Names Record::
* Very Long String Record::
* Character Encoding Record::
* Long String Value Labels Record::
* Long String Missing Values Record::
* Data File and Variable Attributes Records::
* Extended Number of Cases Record::
* Other Informational Records::
* Dictionary Termination Record::
* Data Record::
@end menu

@node System File Record Structure
@section System File Record Structure

System files are divided into records with the following format:

@example
int32               type;
char                data[];
@end example

This header does not identify the length of the @code{data} or any
information about what it contains, so the system file reader must
understand the format of @code{data} based on @code{type}.  However,
records with type 7, called @dfn{extension records}, have a stricter
format:

@example
int32               type;
int32               subtype;
int32               size;
int32               count;
char                data[size * count];
@end example

@table @code
@item int32 type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  This value identifies a particular kind of extension
record.

@item int32 size;
The size of each piece of data that follows the header, in bytes.
Known extension records use 1, 4, or 8, for @code{char}, @code{int32},
and @code{flt64} format data, respectively.

@item int32 count;
The number of pieces of data that follow the header.

@item char data[size * count];
Data, whose format and interpretation depend on the subtype.
@end table

An extension record contains exactly @code{size * count} bytes of
data, which allows a reader that does not understand an extension
record to skip it.  Extension records provide only nonessential
information, so this allows for files written by newer software to
preserve backward compatibility with older or less capable readers.

Records in a system file must appear in the following order:

@itemize @bullet
@item
File header record.

@item
Variable records.

@item
All pairs of value labels records and value label variables records,
if present.

@item
Document record, if present.

@item
Extension (type 7) records, in ascending numerical order of their
subtypes.

System files written by SPSS include at most one of each kind of
extension record.  This is generally true of system files written by
other software as well, with known exceptions noted below in the
individual sections about each type of record.

@item
Dictionary termination record.

@item
Data record.
@end itemize

We advise authors of programs that read system files to tolerate
format variations.  Various kinds of misformatting and corruption have
been observed in system files written by SPSS and other software
alike.  In particular, because extension records provide nonessential
information, it is generally better to ignore an extension record
entirely than to refuse to read a system file.

The following sections describe the known kinds of records.

@node File Header Record
@section File Header Record

A system file begins with the file header, with the following format:

@example
char                rec_type[4];
char                prod_name[60];
int32               layout_code;
int32               nominal_case_size;
int32               compression;
int32               weight_index;
int32               ncases;
flt64               bias;
char                creation_date[9];
char                creation_time[8];
char                file_label[64];
char                padding[3];
@end example

@table @code
@item char rec_type[4];
Record type code, either @samp{$FL2} for system files with
uncompressed data or data compressed with simple bytecode compression,
or @samp{$FL3} for system files with ZLIB compressed data.

This is truly a character field that uses the same character encoding as
other strings.  Thus, in a file with an ASCII-based character encoding
this field contains @code{24 46 4c 32} or @code{24 46 4c 33}, and in a
file with an EBCDIC-based encoding this field contains @code{5b c6 d3
f2}.  (No EBCDIC-based ZLIB-compressed files have been observed.)

@item char prod_name[60];
Product identification string.  This always begins with the characters
@samp{@@(#) SPSS DATA FILE}.  PSPP uses the remaining characters to
give its version and the operating system name; for example, @samp{GNU
pspp 0.1.4 - sparc-sun-solaris2.5.2}.  The string is truncated if it
would be longer than 60 characters; otherwise it is padded on the right
with spaces.

The product name field allows readers to behave differently based on
quirks in the way that particular software writes system files.
@xref{Value Labels Records}, for the detail of the quirk that the PSPP
system file reader tolerates in files written by ReadStat, which has
@code{https://github.com/WizardMac/ReadStat} in @code{prod_name}.

@anchor{layout_code}
@item int32 layout_code;
Normally set to 2, although a few system files have been spotted in
the wild with a value of 3 here.  PSPP uses this value to determine the
file's integer endianness (@pxref{System File Format}).

@item int32 nominal_case_size;
Number of data elements per case.  This is the number of variables,
except that long string variables add extra data elements (one for every
8 characters after the first 8).  However, string variables do not
contribute to this value beyond the first 255 bytes.   Further, some
software always writes -1 or 0 in this field.  In general, it is
unsafe for systems reading system files to rely upon this value.

@item int32 compression;
Set to 0 if the data in the file is not compressed, 1 if the data is
compressed with simple bytecode compression, 2 if the data is ZLIB
compressed.  This field has value 2 if and only if @code{rec_type} is
@samp{$FL3}.

@item int32 weight_index;
If one of the variables in the data set is used as a weighting
variable, set to the 1-based dictionary index of that variable
(@pxref{Dictionary Index}).  Otherwise, set to 0.

@item int32 ncases;
Set to the number of cases in the file if it is known, or -1 otherwise.

In the general case it is not possible to determine the number of cases
that will be output to a system file at the time that the header is
written.  The way that this is dealt with is by writing the entire
system file, including the header, then seeking back to the beginning of
the file and writing just the @code{ncases} field.  For files in which
this is not valid, the seek operation fails.  In this case,
@code{ncases} remains -1.

@anchor{bias}
@item flt64 bias;
Compression bias, ordinarily set to 100.  Only integers between
@code{1 - bias} and @code{251 - bias} can be compressed.

By assuming that its value is 100, PSPP uses @code{bias} to determine
the file's floating-point format and endianness (@pxref{System File
Format}).  If the compression bias is not 100, PSPP cannot auto-detect
the floating-point format and assumes that it is IEEE 754 format with
the same endianness as the system file's integers, which is correct
for all known system files.

@item char creation_date[9];
Date of creation of the system file, in @samp{dd mmm yy}
format, with the month as standard English abbreviations, using an
initial capital letter and following with lowercase.  If the date is not
available then this field is arbitrarily set to @samp{01 Jan 70}.

@item char creation_time[8];
Time of creation of the system file, in @samp{hh:mm:ss}
format and using 24-hour time.  If the time is not available then this
field is arbitrarily set to @samp{00:00:00}.

@item char file_label[64];
File label declared by the user, if any (@pxref{FILE LABEL,,,pspp,
PSPP Users Guide}).  Padded on the right with spaces.

A product that identifies itself as @code{VOXCO INTERVIEWER 4.3} uses
CR-only line ends in this field, rather than the more usual LF-only or
CR LF line ends.

@item char padding[3];
Ignored padding bytes to make the structure a multiple of 32 bits in
length.  Set to zeros.
@end table

@node Variable Record
@section Variable Record

There must be one variable record for each numeric variable and each
string variable with width 8 bytes or less.  String variables wider
than 8 bytes have one variable record for each 8 bytes, rounding up.
The first variable record for a long string specifies the variable's
correct dictionary information.  Subsequent variable records for a
long string are filled with dummy information: a type of -1, no
variable label or missing values, print and write formats that are
ignored, and an empty string as name.  A few system files have been
encountered that include a variable label on dummy variable records,
so readers should take care to parse dummy variable records in the
same way as other variable records.

@anchor{Dictionary Index}
The @dfn{dictionary index} of a variable is a 1-based offset in the set of
variable records, including dummy variable records for long string
variables.  The first variable record has a dictionary index of 1, the
second has a dictionary index of 2, and so on.

The system file format does not directly support string variables
wider than 255 bytes.  Such very long string variables are represented
by a number of narrower string variables.  @xref{Very Long String
Record}, for details.

A system file should contain at least one variable and thus at least
one variable record, but system files have been observed in the wild
without any variables (thus, no data either).

@example
int32               rec_type;
int32               type;
int32               has_var_label;
int32               n_missing_values;
int32               print;
int32               write;
char                name[8];

/* @r{Present only if @code{has_var_label} is 1.} */
int32               label_len;
char                label[];

/* @r{Present only if @code{n_missing_values} is nonzero}. */
flt64               missing_values[];
@end example

@table @code
@item int32 rec_type;
Record type code.  Always set to 2.

@item int32 type;
Variable type code.  Set to 0 for a numeric variable.  For a short
string variable or the first part of a long string variable, this is set
to the width of the string.  For the second and subsequent parts of a
long string variable, set to -1, and the remaining fields in the
structure are ignored.

@item int32 has_var_label;
If this variable has a variable label, set to 1; otherwise, set to 0.

@item int32 n_missing_values;
If the variable has no missing values, set to 0.  If the variable has
one, two, or three discrete missing values, set to 1, 2, or 3,
respectively.  If the variable has a range of missing values, set to
-2; if the variable has a range of missing values plus a single
discrete value, set to -3.

A long string variable always has the value 0 here.  A separate record
indicates missing values for long string variables (@pxref{Long String
Missing Values Record}).

@item int32 print;
Print format for this variable.  See below.

@item int32 write;
Write format for this variable.  See below.

@item char name[8];
Variable name.  The variable name must begin with a capital letter or
the at-sign (@samp{@@}).  Subsequent characters may also be digits, octothorpes
(@samp{#}), dollar signs (@samp{$}), underscores (@samp{_}), or full
stops (@samp{.}).  The variable name is padded on the right with spaces.

The @samp{name} fields should be unique within a system file.  System
files written by SPSS that contain very long string variables with
similar names sometimes contain duplicate names that are later
eliminated by resolving the very long string names (@pxref{Very Long
String Record}).  PSPP handles duplicates by assigning them new,
unique names.

@item int32 label_len;
This field is present only if @code{has_var_label} is set to 1.  It is
set to the length, in characters, of the variable label.  The
documented maximum length varies from 120 to 255 based on SPSS
version, but some files have been seen with longer labels.  PSPP
accepts labels of any length.

@item char label[];
This field is present only if @code{has_var_label} is set to 1.  It has
length @code{label_len}, rounded up to the nearest multiple of 32 bits.
The first @code{label_len} characters are the variable's variable label.

@item flt64 missing_values[];
This field is present only if @code{n_missing_values} is nonzero.  It
has the same number of 8-byte elements as the absolute value of
@code{n_missing_values}.  Each element is interpreted as a number for
numeric variables (with HIGHEST and LOWEST indicated as described in
the chapter introduction).  For string variables of width less than 8
bytes, elements are right-padded with spaces; for string variables
wider than 8 bytes, only the first 8 bytes of each missing value are
specified, with the remainder implicitly all spaces.

For discrete missing values, each element represents one missing
value.  When a range is present, the first element denotes the minimum
value in the range, and the second element denotes the maximum value
in the range.  When a range plus a value are present, the third
element denotes the additional discrete missing value.
@end table

@anchor{System File Output Formats}
The @code{print} and @code{write} fields of the variable record are output
formats coded into @code{int32} types.  The least-significant byte
of the @code{int32} represents the number of decimal places, and the
next two bytes in order of increasing significance represent field width
and format type, respectively.  The most-significant byte is not
used and should be set to zero.

Format types are defined as follows:

@quotation
@multitable {Value} {@code{DATETIME}}
@headitem Value
@tab Meaning
@item 0
@tab Not used.
@item 1
@tab @code{A}
@item 2
@tab @code{AHEX}
@item 3
@tab @code{COMMA}
@item 4
@tab @code{DOLLAR}
@item 5
@tab @code{F}
@item 6
@tab @code{IB}
@item 7
@tab @code{PIBHEX}
@item 8
@tab @code{P}
@item 9
@tab @code{PIB}
@item 10
@tab @code{PK}
@item 11
@tab @code{RB}
@item 12
@tab @code{RBHEX}
@item 13
@tab Not used.
@item 14
@tab Not used.
@item 15
@tab @code{Z}
@item 16
@tab @code{N}
@item 17
@tab @code{E}
@item 18
@tab Not used.
@item 19
@tab Not used.
@item 20
@tab @code{DATE}
@item 21
@tab @code{TIME}
@item 22
@tab @code{DATETIME}
@item 23
@tab @code{ADATE}
@item 24
@tab @code{JDATE}
@item 25
@tab @code{DTIME}
@item 26
@tab @code{WKDAY}
@item 27
@tab @code{MONTH}
@item 28
@tab @code{MOYR}
@item 29
@tab @code{QYR}
@item 30
@tab @code{WKYR}
@item 31
@tab @code{PCT}
@item 32
@tab @code{DOT}
@item 33
@tab @code{CCA}
@item 34
@tab @code{CCB}
@item 35
@tab @code{CCC}
@item 36
@tab @code{CCD}
@item 37
@tab @code{CCE}
@item 38
@tab @code{EDATE}
@item 39
@tab @code{SDATE}
@item 40
@tab @code{MTIME}
@item 41
@tab @code{YMDHMS}
@end multitable
@end quotation

A few system files have been observed in the wild with invalid
@code{write} fields, in particular with value 0.  Readers should
probably treat invalid @code{print} or @code{write} fields as some
default format.

@node Value Labels Records
@section Value Labels Records

The value label records documented in this section are used for
numeric and short string variables only.  Long string variables may
have value labels, but their value labels are recorded using a
different record type (@pxref{Long String Value Labels Record}).

ReadStat (@pxref{File Header Record}) writes value labels that label a
single value more than once.  Specifically, it emits value labels
whose values are longer than the string variable's width and that are
identical within the actual width of the variable, e.g.@: labels for
values @code{ABC123} and @code{ABC456} for a string variable with
width 3.  For files written by this software, PSPP ignores such
labels.

The value label record has the following format:

@example
int32               rec_type;
int32               label_count;

/* @r{Repeated @code{label_count} times}. */
char                value[8];
char                label_len;
char                label[];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 3.

@item int32 label_count;
Number of value labels present in this record.
@end table

The remaining fields are repeated @code{label_count} times.  Each
repetition specifies one value label.

@table @code
@item char value[8];
A numeric value or a short string value padded as necessary to 8 bytes
in length.  Its type and width cannot be determined until the
following value label variables record (see below) is read.

@item char label_len;
The label's length, in bytes.  The documented maximum length varies
from 60 to 120 based on SPSS version.  PSPP supports value labels up
to 255 bytes long.

@item char label[];
@code{label_len} bytes of the actual label, followed by up to 7 bytes
of padding to bring @code{label} and @code{label_len} together to a
multiple of 8 bytes in length.
@end table

The value label record is always immediately followed by a value label
variables record with the following format:

@example
int32               rec_type;
int32               var_count;
int32               vars[];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 4.

@item int32 var_count;
Number of variables to which the associated value labels from the
value label record are to be applied.

@item int32 vars[];
A list of 1-based dictionary indexes of variables to which to apply the value
labels (@pxref{Dictionary Index}).  There are @code{var_count}
elements.

String variables wider than 8 bytes may not be specified in this list.
@end table

@node Document Record
@section Document Record

The document record, if present, has the following format:

@example
int32               rec_type;
int32               n_lines;
char                lines[][80];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 6.

@item int32 n_lines;
Number of lines of documents present.  This should be greater than
zero, but ReadStat writes system files with zero @code{n_lines}.

@item char lines[][80];
Document lines.  The number of elements is defined by @code{n_lines}.
Lines shorter than 80 characters are padded on the right with spaces.
@end table

@node Machine Integer Info Record
@section Machine Integer Info Record

The integer info record, if present, has the following format:

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Data.} */
int32               version_major;
int32               version_minor;
int32               version_revision;
int32               machine_code;
int32               floating_point_rep;
int32               compression_code;
int32               endianness;
int32               character_code;
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 3.

@item int32 size;
Size of each piece of data in the data part, in bytes.  Always set to 4.

@item int32 count;
Number of pieces of data in the data part.  Always set to 8.

@item int32 version_major;
PSPP major version number.  In version @var{x}.@var{y}.@var{z}, this
is @var{x}.

@item int32 version_minor;
PSPP minor version number.  In version @var{x}.@var{y}.@var{z}, this
is @var{y}.

@item int32 version_revision;
PSPP version revision number.  In version @var{x}.@var{y}.@var{z},
this is @var{z}.

@item int32 machine_code;
Machine code.  PSPP always sets this field to -1, but other
values may appear.

@item int32 floating_point_rep;
Floating point representation code.  For IEEE 754 systems this is 1.
IBM 370 sets this to 2, and DEC VAX E to 3.

@item int32 compression_code;
Compression code.  Always set to 1, regardless of whether or how the
file is compressed.

@item int32 endianness;
Machine endianness.  1 indicates big-endian, 2 indicates little-endian.

@item int32 character_code;
@anchor{character-code} Character code.  The following values have
been actually observed in system files:

@table @asis
@item 1
EBCDIC.

@item 2
7-bit ASCII.

@item 1250
The @code{windows-1250} code page for Central European and Eastern
European languages.

@item 1252
The @code{windows-1252} code page for Western European languages.

@item 28591
ISO 8859-1.

@item 65001
UTF-8.
@end table

The following additional values are known to be defined:

@table @asis
@item 3
8-bit ``ASCII''.

@item 4
DEC Kanji.
@end table

Other Windows code page numbers are known to be generally valid.

Old versions of SPSS for Unix and Windows always wrote value 2 in this
field, regardless of the encoding in use.  Newer versions also write
the character encoding as a string (see @ref{Character Encoding
Record}).
@end table

@node Machine Floating-Point Info Record
@section Machine Floating-Point Info Record

The floating-point info record, if present, has the following format:

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Data.} */
flt64               sysmis;
flt64               highest;
flt64               lowest;
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 4.

@item int32 size;
Size of each piece of data in the data part, in bytes.  Always set to 8.

@item int32 count;
Number of pieces of data in the data part.  Always set to 3.

@item flt64 sysmis;
@itemx flt64 highest;
@itemx flt64 lowest;
The system missing value, the value used for HIGHEST in missing
values, and the value used for LOWEST in missing values, respectively.
@xref{System File Format}, for more information.

The SPSSWriter library in PHP, which identifies itself as @code{FOM
SPSS 1.0.0} in the file header record @code{prod_name} field, writes
unexpected values to these fields, but it uses the same values
consistently throughout the rest of the file.
@end table

@node Multiple Response Sets Records
@section Multiple Response Sets Records

The system file format has two different types of records that
represent multiple response sets (@pxref{MRSETS,,,pspp, PSPP Users
Guide}).  The first type of record describes multiple response sets
that can be understood by SPSS before version 14.  The second type of
record, with a closely related format, is used for multiple dichotomy
sets that use the CATEGORYLABELS=COUNTEDVALUES feature added in
version 14.

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Exactly @code{count} bytes of data.} */
char                mrsets[];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Set to 7 for records that describe multiple response
sets understood by SPSS before version 14, or to 19 for records that
describe dichotomy sets that use the CATEGORYLABELS=COUNTEDVALUES
feature added in version 14.

@item int32 size;
The size of each element in the @code{mrsets} member. Always set to 1.

@item int32 count;
The total number of bytes in @code{mrsets}.

@item char mrsets[];
Zero or more line feeds (byte 0x0a), followed by a series of multiple
response sets, each of which consists of the following:

@itemize @bullet
@item
The set's name (an identifier that begins with @samp{$}), in mixed
upper and lower case.

@item
An equals sign (@samp{=}).

@item
@samp{C} for a multiple category set, @samp{D} for a multiple
dichotomy set with CATEGORYLABELS=VARLABELS, or @samp{E} for a
multiple dichotomy set with CATEGORYLABELS=COUNTEDVALUES.

@item
For a multiple dichotomy set with CATEGORYLABELS=COUNTEDVALUES, a
space, followed by a number expressed as decimal digits, followed by a
space.  If LABELSOURCE=VARLABEL was specified on MRSETS, then the
number is 11; otherwise it is 1.@footnote{This part of the format may
not be fully understood, because only a single example of each
possibility has been examined.}

@item
For either kind of multiple dichotomy set, the counted value, as a
positive integer count specified as decimal digits, followed by a
space, followed by as many string bytes as specified in the count.  If
the set contains numeric variables, the string consists of the counted
integer value expressed as decimal digits.  If the set contains string
variables, the string contains the counted string value.  Either way,
the string may be padded on the right with spaces (older versions of
SPSS seem to always pad to a width of 8 bytes; newer versions don't).

@item
A space.

@item
The multiple response set's label, using the same format as for the
counted value for multiple dichotomy sets.  A string of length 0 means
that the set does not have a label.  A string of length 0 is also
written if LABELSOURCE=VARLABEL was specified.

@item
A space.

@item
The short names of the variables in the set, converted to lowercase,
each separated from the previous by a single space.

Even though a multiple response set must have at least two variables,
some system files contain multiple response sets with no variables or
one variable.  The source and meaning of these multiple response sets is
unknown.  (Perhaps they arise from creating a multiple response set
then deleting all the variables that it contains?)

@item
One line feed (byte 0x0a).  Sometimes multiple line feeds, even
hundreds, are present.
@end itemize
@end table

Example: Given appropriate variable definitions, consider the
following MRSETS command:

@example
MRSETS /MCGROUP NAME=$a LABEL='my mcgroup' VARIABLES=a b c
       /MDGROUP NAME=$b VARIABLES=g e f d VALUE=55
       /MDGROUP NAME=$c LABEL='mdgroup #2' VARIABLES=h i j VALUE='Yes'
       /MDGROUP NAME=$d LABEL='third mdgroup' CATEGORYLABELS=COUNTEDVALUES
        VARIABLES=k l m VALUE=34
       /MDGROUP NAME=$e CATEGORYLABELS=COUNTEDVALUES LABELSOURCE=VARLABEL
        VARIABLES=n o p VALUE='choice'.
@end example

The above would generate the following multiple response set record of
subtype 7:

@example
$a=C 10 my mcgroup a b c
$b=D2 55 0  g e f d
$c=D3 Yes 10 mdgroup #2 h i j
@end example

It would also generate the following multiple response set record with
subtype 19:

@example
$d=E 1 2 34 13 third mdgroup k l m
$e=E 11 6 choice 0  n o p
@end example

@node Extra Product Info Record
@section Extra Product Info Record

This optional record appears to contain a text string that describes
the program that wrote the file and the source of the data.  (This is
redundant with the file label and product info found in the file
header record.)

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Exactly @code{count} bytes of data.} */
char                info[];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 10.

@item int32 size;
The size of each element in the @code{info} member. Always set to 1.

@item int32 count;
The total number of bytes in @code{info}.

@item char info[];
A text string.  A product that identifies itself as @code{VOXCO
INTERVIEWER 4.3} uses CR-only line ends in this field, rather than the
more usual LF-only or CR LF line ends.
@end table

@node Variable Display Parameter Record
@section Variable Display Parameter Record

The variable display parameter record, if present, has the following
format:

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Repeated @code{count} times}. */
int32               measure;
int32               width;           /* @r{Not always present.} */
int32               alignment;
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 11.

@item int32 size;
The size of @code{int32}.  Always set to 4.

@item int32 count;
The number of sets of variable display parameters (ordinarily the
number of variables in the dictionary), times 2 or 3.
@end table

The remaining members are repeated @code{count} times, in the same
order as the variable records.  No element corresponds to variable
records that continue long string variables.  The meanings of these
members are as follows:

@table @code
@item int32 measure;
The measurement level of the variable:
@table @asis
@item 0
Unknown
@item 1
Nominal
@item 2
Ordinal
@item 3
Scale
@end table

An ``unknown'' @code{measure} of 0 means that the variable was created
in some way that doesn't make the measurement level clear, e.g.@: with
a @code{COMPUTE} transformation.  PSPP sets the measurement level the
first time it reads the data using the rules documented in
@ref{Measurement Level,,,pspp, PSPP Users Guide}, so this should
rarely appear.

@item int32 width;
The width of the display column for the variable in characters.

This field is present if @var{count} is 3 times the number of
variables in the dictionary.  It is omitted if @var{count} is 2 times
the number of variables.

@item int32 alignment;
The alignment of the variable for display purposes:

@table @asis
@item 0
Left aligned
@item 1
Right aligned
@item 2
Centre aligned
@end table
@end table

@node Variable Sets Record
@section Variable Sets Record

The SPSS GUI offers users the ability to arrange variables in sets.
Users may enable and disable sets individually, and the data editor
and analysis dialog boxes only show enabled sets.  Syntax does not use
variable sets.

The variable sets record, if present, has the following format:

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Exactly @code{count} bytes of text.} */
char                text[];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 5.

@item int32 size;
Always set to 1.

@item int32 count;
The total number of bytes in @code{text}.

@item char text[];
The variable sets, in a text-based format.

Each variable set occupies one line of text, each of which ends with a
line feed (byte 0x0a), optionally preceded by a carriage return (byte
0x0d).

Each line begins with the name of the variable set, followed by an
equals sign (@samp{=}) and a space (byte 0x20), followed by the long
variable names of the members of the set, separated by spaces.  A
variable set may be empty, in which case the equals sign and the space
following it are still present.
@end table

@node Long Variable Names Record
@section Long Variable Names Record

If present, the long variable names record has the following format:

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Exactly @code{count} bytes of data.} */
char                var_name_pairs[];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 13.

@item int32 size;
The size of each element in the @code{var_name_pairs} member. Always set to 1.

@item int32 count;
The total number of bytes in @code{var_name_pairs}.

@item char var_name_pairs[];
A list of @var{key}--@var{value} tuples, where @var{key} is the name
of a variable, and @var{value} is its long variable name.
The @var{key} field is at most 8 bytes long and must match the
name of a variable which appears in the variable record (@pxref{Variable
Record}).
The @var{value} field is at most 64 bytes long.
The @var{key} and @var{value} fields are separated by a @samp{=} byte.
Each tuple is separated by a byte whose value is 09.  There is no
trailing separator following the last tuple.
The total length is @code{count} bytes.
@end table

@node Very Long String Record
@section Very Long String Record

Old versions of SPSS limited string variables to a width of 255 bytes.
For backward compatibility with these older versions, the system file
format represents a string longer than 255 bytes, called a @dfn{very
long string}, as a collection of strings no longer than 255 bytes
each.  The strings concatenated to make a very long string are called
its @dfn{segments}; for consistency, variables other than very long
strings are considered to have a single segment.

A very long string with a width of @var{w} has @var{n} =
(@var{w} + 251) / 252 segments, that is, one segment for every
252 bytes of width, rounding up.  It would be logical, then, for each
of the segments except the last to have a width of 252 and the last
segment to have the remainder, but this is not the case.  In fact,
each segment except the last has a width of 255 bytes.  The last
segment has width @var{w} - (@var{n} - 1) * 252; some versions
of SPSS make it slightly wider, but not wide enough to make the last
segment require another 8 bytes of data.

Data is packed tightly into segments of a very long string, 255 bytes
per segment.  Because 255 bytes of segment data are allocated for
every 252 bytes of the very long string's width (approximately), some
unused space is left over at the end of the allocated segments.  Data
in unused space is ignored.

Example: Consider a very long string of width 20,000.  Such a very
long string has 20,000 / 252 = 80 (rounding up) segments.  The first
79 segments have width 255; the last segment has width 20,000 - 79 *
252 = 92 or slightly wider (up to 96 bytes, the next multiple of 8).
The very long string's data is actually stored in the 19,890 bytes in
the first 78 segments, plus the first 110 bytes of the 79th segment
(19,890 + 110 = 20,000).  The remaining 145 bytes of the 79th segment
and all 92 bytes of the 80th segment are unused.

The very long string record explains how to stitch together segments
to obtain very long string data.  For each of the very long string
variables in the dictionary, it specifies the name of its first
segment's variable and the very long string variable's actual width.
The remaining segments immediately follow the named variable in the
system file's dictionary.

The very long string record, which is present only if the system file
contains very long string variables, has the following format:

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Exactly @code{count} bytes of data.} */
char                string_lengths[];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 14.

@item int32 size;
The size of each element in the @code{string_lengths} member. Always set to 1.

@item int32 count;
The total number of bytes in @code{string_lengths}.

@item char string_lengths[];
A list of @var{key}--@var{value} tuples, where @var{key} is the name
of a variable, and @var{value} is its length.
The @var{key} field is at most 8 bytes long and must match the
name of a variable which appears in the variable record (@pxref{Variable
Record}).
The @var{value} field is exactly 5 bytes long. It is a zero-padded,
ASCII-encoded string that is the length of the variable.
The @var{key} and @var{value} fields are separated by a @samp{=} byte.
Tuples are delimited by a two-byte sequence @{00, 09@}.
After the last tuple, there may be a single byte 00, or @{00, 09@}.
The total length is @code{count} bytes.
@end table

@node Character Encoding Record
@section Character Encoding Record

This record, if present, indicates the character encoding for string data,
long variable names, variable labels, value labels and other strings in the
file.

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Exactly @code{count} bytes of data.} */
char                encoding[];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 20.

@item int32 size;
The size of each element in the @code{encoding} member. Always set to 1.

@item int32 count;
The total number of bytes in @code{encoding}.

@item char encoding[];
The name of the character encoding.  Normally this will be an official
IANA character set name or alias.
See @url{http://www.iana.org/assignments/character-sets}.
Character set names are not case-sensitive, but SPSS appears to write
them in all-uppercase.
@end table

This record is not present in files generated by older software.  See
also the @code{character_code} field in the machine integer info
record (@pxref{character-code}).

When the character encoding record and the machine integer info record
are both present, all system files observed in practice indicate the
same character encoding, e.g.@: 1252 as @code{character_code} and
@code{windows-1252} as @code{encoding}, 65001 and @code{UTF-8}, etc.

If, for testing purposes, a file is crafted with different
@code{character_code} and @code{encoding}, it seems that
@code{character_code} controls the encoding for all strings in the
system file before the dictionary termination record, including
strings in data (e.g.@: string missing values), and @code{encoding}
controls the encoding for strings following the dictionary termination
record.

@node Long String Value Labels Record
@section Long String Value Labels Record

This record, if present, specifies value labels for long string
variables.

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Repeated up to exactly @code{count} bytes.} */
int32               var_name_len;
char                var_name[];
int32               var_width;
int32               n_labels;
long_string_label   labels[];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 21.

@item int32 size;
Always set to 1.

@item int32 count;
The number of bytes following the header until the next header.

@item int32 var_name_len;
@itemx char var_name[];
The number of bytes in the name of the variable that has long string
value labels, plus the variable name itself, which consists of exactly
@code{var_name_len} bytes.  The variable name is not padded to any
particular boundary, nor is it null-terminated.

@item int32 var_width;
The width of the variable, in bytes, which will be between 9 and
32767.

@item int32 n_labels;
@itemx long_string_label labels[];
The long string labels themselves.  The @code{labels} array contains
exactly @code{n_labels} elements, each of which has the following
substructure:

@example
int32               value_len;
char                value[];
int32               label_len;
char                label[];
@end example

@table @code
@item int32 value_len;
@itemx char value[];
The string value being labeled.  @code{value_len} is the number of
bytes in @code{value}; it is equal to @code{var_width}.  The
@code{value} array is not padded or null-terminated.

@item int32 label_len;
@itemx char label[];
The label for the string value.  @code{label_len}, which must be
between 0 and 120, is the number of bytes in @code{label}.  The
@code{label} array is not padded or null-terminated.
@end table
@end table

@node Long String Missing Values Record
@section Long String Missing Values Record

This record, if present, specifies missing values for long string
variables.

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Repeated up to exactly @code{count} bytes.} */
int32               var_name_len;
char                var_name[];
char                n_missing_values;
int32               value_len;
char                values[value_len * n_missing_values];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 22.

@item int32 size;
Always set to 1.

@item int32 count;
The number of bytes following the header until the next header.

@item int32 var_name_len;
@itemx char var_name[];
The number of bytes in the name of the long string variable that has
missing values, plus the variable name itself, which consists of
exactly @code{var_name_len} bytes.  The variable name is not padded to
any particular boundary, nor is it null-terminated.

@item char n_missing_values;
The number of missing values, either 1, 2, or 3.  (This is, unusually,
a single byte instead of a 32-bit number.)

@item int32 value_len;
The length of each missing value string, in bytes.  This value should
be 8, because long string variables are at least 8 bytes wide (by
definition), only the first 8 bytes of a long string variable's
missing values are allowed to be non-spaces, and any spaces within the
first 8 bytes are included in the missing value here.

@item char values[value_len * n_missing_values];
The missing values themselves, without any padding or null
terminators.
@end table

An earlier version of this document stated that @code{value_len} was
repeated before each of the missing values, so that there was an extra
@code{int32} value of 8 before each missing value after the first.
Old versions of PSPP wrote data files in this format.  Readers can
tolerate this mistake, if they wish, by noticing and skipping the
extra @code{int32} values, which wouldn't ordinarily occur in strings.

@node Data File and Variable Attributes Records
@section Data File and Variable Attributes Records

The data file and variable attributes records represent custom
attributes for the system file or for individual variables in the
system file, as defined on the DATAFILE ATTRIBUTE (@pxref{DATAFILE
ATTRIBUTE,,,pspp, PSPP Users Guide}) and VARIABLE ATTRIBUTE commands
(@pxref{VARIABLE ATTRIBUTE,,,pspp, PSPP Users Guide}), respectively.

@example
/* @r{Header.} */
int32               rec_type;
int32               subtype;
int32               size;
int32               count;

/* @r{Exactly @code{count} bytes of data.} */
char                attributes[];
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 17 for a data file attribute record or
to 18 for a variable attributes record.

@item int32 size;
The size of each element in the @code{attributes} member. Always set to 1.

@item int32 count;
The total number of bytes in @code{attributes}.

@item char attributes[];
The attributes, in a text-based format.

In record subtype 17, this field contains a single attribute set.  An
attribute set is a sequence of one or more attributes concatenated
together.  Each attribute consists of a name, which has the same
syntax as a variable name, followed by, inside parentheses, a sequence
of one or more values.  Each value consists of a string enclosed in
single quotes (@code{'}) followed by a line feed (byte 0x0a).  A value
may contain single quote characters, which are not themselves escaped
or quoted or required to be present in pairs.  There is no apparent
way to embed a line feed in a value.  There is no distinction between
an attribute with a single value and an attribute array with one
element.

In record subtype 18, this field contains a sequence of one or more
variable attribute sets.  If more than one variable attribute set is
present, each one after the first is delimited from the previous by
@code{/}.  Each variable attribute set consists of a long
variable name,
followed by @code{:}, followed by an attribute set with the same
syntax as on record subtype 17.

System files written by @code{Stata 14.1/-savespss- 1.77 by
S.Radyakin} may include multiple records with subtype 18, one per
variable that has variable attributes.

The total length is @code{count} bytes.
@end table

@subheading Example

A system file produced with the following VARIABLE ATTRIBUTE commands
in effect:

@example
VARIABLE ATTRIBUTE VARIABLES=dummy ATTRIBUTE=fred[1]('23') fred[2]('34').
VARIABLE ATTRIBUTE VARIABLES=dummy ATTRIBUTE=bert('123').
@end example

@noindent
will contain a variable attribute record with the following contents:

@example
0000  07 00 00 00 12 00 00 00  01 00 00 00 22 00 00 00  |............"...|
0010  64 75 6d 6d 79 3a 66 72  65 64 28 27 32 33 27 0a  |dummy:fred('23'.|
0020  27 33 34 27 0a 29 62 65  72 74 28 27 31 32 33 27  |'34'.)bert('123'|
0030  0a 29                                             |.)              |
@end example

@menu
* Variable Roles::
@end menu

@node Variable Roles
@subsection Variable Roles

A variable's role is represented as an attribute named @code{$@@Role}.
This attribute has a single element whose values and their meanings
are:

@table @code
@item 0
Input.  This, the default, is the most common role.
@item 1
Output.
@item 2
Both.
@item 3
None.
@item 4
Partition.
@item 5
Split.
@end table

@node Extended Number of Cases Record
@section Extended Number of Cases Record

The file header record expresses the number of cases in the system
file as an int32 (@pxref{File Header Record}).  This record allows the
number of cases in the system file to be expressed as a 64-bit number.

@example
int32               rec_type;
int32               subtype;
int32               size;
int32               count;
int64               unknown;
int64               ncases64;
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 7.

@item int32 subtype;
Record subtype.  Always set to 16.

@item int32 size;
Size of each element.  Always set to 8.

@item int32 count;
Number of pieces of data in the data part.  Always set to 2.

@item int64 unknown;
Meaning unknown.  Always set to 1.

@item int64 ncases64;
Number of cases in the file as a 64-bit integer.  Presumably this
could be -1 to indicate that the number of cases is unknown, for the
same reason as @code{ncases} in the file header record, but this has
not been observed in the wild.
@end table

@node Other Informational Records
@section Other Informational Records

This chapter documents many specific types of extension records, but
others are known to exist.  PSPP ignores unknown extension records
when reading system files.

The following extension record subtypes have also been observed, with
the following believed meanings:

@table @asis
@item 6
Date info, probably related to USE (according to Aapi H@"am@"al@"ainen).

@item 12
A UUID in the format described in RFC 4122.  Only two examples
observed, both written by SPSS 13, and in each case the UUID contained
both upper and lower case.

@item 24
XML that describes how data in the file should be displayed on-screen.
@end table

@node Dictionary Termination Record
@section Dictionary Termination Record

The dictionary termination record separates all other records from the
data records.

@example
int32               rec_type;
int32               filler;
@end example

@table @code
@item int32 rec_type;
Record type.  Always set to 999.

@item int32 filler;
Ignored padding.  Should be set to 0.
@end table

@node Data Record
@section Data Record

The data record must follow all other records in the system file.
Every system file must have a data record that specifies data for at
least one case.  The format of the data record varies depending on the
value of @code{compression} in the file header record:

@table @asis
@item 0: no compression
Data is arranged as a series of 8-byte elements.
Each element corresponds to
the variable declared in the respective variable record (@pxref{Variable
Record}).  Numeric values are given in @code{flt64} format; string
values are literal character strings, padded on the right when
necessary to fill out 8-byte units.

@item 1: bytecode compression
The first 8 bytes
of the data record are divided into a series of 1-byte command
codes.  These codes have meanings as described below:

@table @asis
@item 0
Ignored.  If the program writing the system file accumulates compressed
data in blocks of fixed length, 0 bytes can be used to pad out extra
bytes remaining at the end of a fixed-size block.

@item 1 through 251
A number with
value @var{code} - @var{bias}, where
@var{code} is the value of the compression code and @var{bias} is the
variable @code{bias} from the file header.  For example,
code 105 with bias 100.0 (the normal value) indicates a numeric variable
of value 5.

A code of 0 (after subtracting the bias) in a string field encodes
null bytes.  This is unusual, since a string field normally encodes
text data, but it exists in real system files.

@item 252
End of file.  This code may or may not appear at the end of the data
stream.  PSPP always outputs this code but its use is not required.

@item 253
A numeric or string value that is not
compressible.  The value is stored in the 8 bytes following the
current block of command bytes.  If this value appears twice in a block
of command bytes, then it indicates the second group of 8 bytes following the
command bytes, and so on.

@item 254
An 8-byte string value that is all spaces.

@item 255
The system-missing value.
@end table

The end of the 8-byte group of bytecodes is followed by any 8-byte
blocks of non-compressible values indicated by code 253.  After that
follows another 8-byte group of bytecodes, then those bytecodes'
non-compressible values.  The pattern repeats to the end of the file
or a code with value 252.

@item 2: ZLIB compression
The data record consists of the following, in order:

@itemize @bullet
@item
ZLIB data header, 24 bytes long.

@item
One or more variable-length blocks of ZLIB compressed data.

@item
ZLIB data trailer, with a 24-byte fixed header plus an additional 24
bytes for each preceding ZLIB compressed data block.
@end itemize

The ZLIB data header has the following format:

@example
int64               zheader_ofs;
int64               ztrailer_ofs;
int64               ztrailer_len;
@end example

@table @code
@item int64 zheader_ofs;
The offset, in bytes, of the beginning of this structure within the
system file.

@item int64 ztrailer_ofs;
The offset, in bytes, of the first byte of the ZLIB data trailer.

@item int64 ztrailer_len;
The number of bytes in the ZLIB data trailer.  This and the previous
field sum to the size of the system file in bytes.
@end table

The data header is followed by @code{(ztrailer_len - 24) / 24} ZLIB
compressed data blocks.  Each ZLIB compressed data block begins with a
ZLIB header as specified in RFC@tie{}1950, e.g.@: hex bytes @code{78
01} (the only header yet observed in practice).  Each block
decompresses to a fixed number of bytes (in practice only
@code{0x3ff000}-byte blocks have been observed), except that the last
block of data may be shorter.  The last ZLIB compressed data block
ends just before offset @code{ztrailer_ofs}.

The result of ZLIB decompression is bytecode compressed data as
described above for compression format 1.

The ZLIB data trailer begins with the following 24-byte fixed header:

@example
int64               int_bias;
int64               zero;
int32               block_size;
int32               n_blocks;
@end example

@table @code
@item int64 int_bias;
The compression bias as a negative integer, e.g.@: if @code{bias} in
the file header record is 100.0, then @code{int_bias} is @minus{}100
(this is the only value yet observed in practice).

@item int64 zero;
Always observed to be zero.

@item int32 block_size;
The number of bytes in each ZLIB compressed data block, except
possibly the last, following decompression.  Only @code{0x3ff000} has
been observed so far.

@item int32 n_blocks;
The number of ZLIB compressed data blocks, always exactly
@code{(ztrailer_len - 24) / 24}.
@end table

The fixed header is followed by @code{n_blocks} 24-byte ZLIB data
block descriptors, each of which describes the compressed data block
corresponding to its offset.  Each block descriptor has the following
format:

@example
int64               uncompressed_ofs;
int64               compressed_ofs;
int32               uncompressed_size;
int32               compressed_size;
@end example

@table @code
@item int64 uncompressed_ofs;
The offset, in bytes, that this block of data would have in a similar
system file that uses compression format 1.  This is
@code{zheader_ofs} in the first block descriptor, and in each
succeeding block descriptor it is the sum of the previous descriptor's
@code{uncompressed_ofs} and @code{uncompressed_size}.

@item int64 compressed_ofs;
The offset, in bytes, of the actual beginning of this compressed data
block.  This is @code{zheader_ofs + 24} in the first block descriptor,
and in each succeeding block descriptor it is the sum of the previous
descriptor's @code{compressed_ofs} and @code{compressed_size}.  The
final block descriptor's @code{compressed_ofs} and
@code{compressed_size} sum to @code{ztrailer_ofs}.

@item int32 uncompressed_size;
The number of bytes in this data block, after decompression.  This is
@code{block_size} in every data block except the last, which may be
smaller.

@item int32 compressed_size;
The number of bytes in this data block, as stored compressed in this
system file.
@end table
@end table

@setfilename ignored