File: index.html

package info (click to toggle)
hdf5 1.8.13%2Bdocs-15
  • links: PTS, VCS
  • area: main
  • in suites: jessie-kfreebsd
  • size: 171,520 kB
  • sloc: ansic: 387,158; f90: 35,195; sh: 20,035; xml: 17,780; cpp: 13,516; makefile: 1,487; perl: 1,299; yacc: 327; lex: 178; ruby: 37
file content (1274 lines) | stat: -rwxr-xr-x 59,395 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
<!doctype HTML public "-//W3C//DTD HTML 4.0 Frameset//EN">
<html>
<head>

<title>Metadata Caching in HDF5</title>

<!--(Meta)==========================================================-->


<!--(Links)=========================================================-->

<link href="../../UG/ed_styles/NewUGelect.css" rel="stylesheet" type="text/css">

<!--( Begin styles definition )=====================================-->
<!--     Replaced with external stylesheet 'styles_NewUG.css'.      -->
<!--( End styles definition )=======================================-->

</head>

<body>
<br />
<p>Return to the <a href="../../Advanced.html">"Advanced Topics"</a> page.</p>
<br /><hr>


<!-- #BeginLibraryItem "/ed_libs/Copyright.lbi" -->
<!--
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  * Copyright by The HDF Group.                                               *
  * Copyright by the Board of Trustees of the University of Illinois.         *
  * All rights reserved.                                                      *
  *                                                                           *
  * This file is part of HDF5.  The full HDF5 copyright notice, including     *
  * terms governing use, modification, and redistribution, is contained in    *
  * the files COPYING and Copyright.html.  COPYING can be found at the root   *
  * of the source code distribution tree; Copyright.html can be found at the  *
  * root level of an installed copy of the electronic HDF5 document set and   *
  * is linked from the top-level documents page.  It can also be found at     *
  * http://www.hdfgroup.org/HDF5/doc/Copyright.html.  If you do not have      *
  * access to either file, you may request a copy from help@hdfgroup.org.     *
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 -->
<!-- #EndLibraryItem --><!-- HEADER LEFT "Metadata Caching in HDF5" -->
<!-- HEADER RIGHT "Metadata Caching in HDF5" -->


<a name="TOP"></a>
<a name="MetadataCache">
<h1>Metadata Caching in HDF5</h1>
</a>

<a name="Intro">
<h2>1. Introduction</h2>
</a>

<p>In the 1.6.4 release, we introduced a re-implementation of the
  metadata cache.  That release contained an incomplete version of
  the cache which could not be controlled via the API.  The version
  in the 1.8 release is more mature, and includes new API calls that
  allow the user program to configure the metadata cache both on file
  open and at run time.

<p>From the user perspective, the most striking effect of the new
  cache should be a large reduction in the cache memory requirements
  when working with complex HDF5 files.

<p>Those working with such files may also notice a reduction in
  file close time.

<p>Those working with HDF5 files with simple structure shouldn't
  notice any particular changes in most cases.  In rare cases,
  there may be a significant improvement in performance.

<p>The remainder of this document contains an architectural
  overview of the old and new metadata caches, a discussion of
  algorithms used to automatically adjust cache size to circumstances,
  and a high level discussion of the cache configuration controls.
  It can be safely skipped by anyone who works only with HDF5 files
  with relatively simple structure (i.e.  no huge groups, no datasets
  with large numbers of chunks, and no objects with large numbers of
  attributes.)

<p>On the other hand, it is mandatory reading if you want to use
  something other than the default metadata cache configuration.  The
  documentation on the metadata cache related API calls will not make
  much sense without this background.
  
<h2>2. Old and New Metadata Cache</h2>
<h3>2.1. The Old Metadata Cache</h3>

<p>The old metadata cache indexed the cache with a hash table
  with no provision for collisions.  Instead, collisions were handled
  by evicting the existing entry to make room for the new entry.
  Aside from flushes, there was no other mechanism for evicting
  entries, so the replacement policy could best be described as
  "Evict on Collision".

<p>As a result, if two frequently used entries hashed to the same
  location, they would evict each other regularly.  To decrease
  the likelihood of this situation, the default hash table size
  was set fairly large -- slightly more than 10,000.  This worked
  well, but since the size of metadata entries is not bounded,
  and since entries were only evicted on collision, the large
  hash table size allowed the cache size to explode when working
  with HDF5 files with complex structure.

<p>The "Evict on Collision" replacement policy also caused
  problems with the parallel version of the HDF5 library, as a
  collision with a dirty entry could force a write in response
  to a metadata read.  Since all metadata writes must be collective
  in the parallel case while reads need not be, this could cause
  the library to hang if only some of the processes participated
  in a metadata read that forced a write.  Prior to the
  implementation of the new metadata cache, we dealt with this
  issue by maintaining a shadow cache for dirty entries evicted
  by a read.
  
<h3>2.2. The New Metadata Cache</h3>

<p>The new metadata cache was designed to address the above
  issues.  After implementation, it became evident that the
  working set size for HDF5 files varies widely depending on
  both structure and access pattern.  Thus it was necessary to
  add support for cache size adjustment under either automatic
  or user program control (see section 2.3 for details).

<p>When the cache is operating under direct user program
  control, it is also possible to temporarily disable evictions
  from the metadata cache so as to maximize raw data throughput
  at the expense of allowing the cache to grow without bound
  until evictions are enabled again.

<p>Structurally, the new metadata cache can be thought of as a
  heavily modified version of the UNIX buffer cache as described
  in chapter three of M. J. Bach's "The Design of the UNIX Operating
  System".  In essence, the UNIX buffer cache uses a hash table with
  chaining to index a pool of fixed size buffers.  It uses the LRU
  replacement policy to select candidates for eviction.

<p>Since HDF5 metadata entries are not of fixed size, and may
  grow arbitrarily large, the size of the new metadata cache
  cannot be controlled by setting a maximum number of entries.
  Instead the new cache keeps a running sum of the sizes of all
  entries, and attempts to evict entries as necessary to stay
  within a user specified maximum size.  (Note the use of the word
  "attempts" here -- as will be seen, it is possible for the cache
  to exceed its currently specified maximum size.)  At present, the
  LRU replacement policy is the only option for selecting candidates
  for eviction.

<p>Per the standard UNIX buffer cache, dirty entries are given
  two passes through the LRU list before being evicted.  The first
  time they reach the end of the LRU list, they are flushed, marked
  as clean, and moved to the head of the LRU list.  When a clean
  entry reaches the end of the LRU list, it is simply evicted if
  space is needed.  

<p>The cache cannot evict entries that are locked, and thus it
  will temporarily grow beyond its maximum size if there are
  insufficient unlocked entries available for eviction.

<p>In the parallel version of the library, only the cache running
  under process 0 of the file communicator is allowed to write
  metadata to file.  All the other caches must retain dirty metadata
  until the process 0 cache tells them that the metadata is clean.

<p>Since all operations modifying metadata must be collective,
  all caches see the same stream of dirty metadata.  This fact
  is used to allow them to synchronize every n bytes of dirty
  metadata, where n is a user configurable value that defaults
  to 256 KB.

<p>To avoid sending the other caches messages from the future,
  process 0 must not write any dirty entries until it reaches a
  synchronization point.  When it reaches a synchronization point,
  it writes entries as needed, and then broadcasts the list of
  flushed entries to the other caches.  The caches on the other
  processes use this list to mark entries clean before they
  leave the synchronization point, allowing them to evict those
  entries as needed.

<p>The caches will also synchronize on a user initiated flush.

<p>To minimize overhead when running in parallel, the cache
  maintains a "clean" LRU list in addition to the regular LRU
  list.  This list contains only clean entries, and is used as a
  source of candidates for eviction when flushing dirty entries
  is not allowed.

<p>Since flushing entries is forbidden most of the time when
  running in parallel, the caches can be forced to exceed their
  maximum sizes if they run out of clean entries to evict.

<p>To decrease the likelihood of this event, the new cache allows
  the user to specify a minimum clean size -- which is a minimum
  total size of all the entries on the clean LRU plus all unused
  space in the cache.  
  
<p>While the clean LRU list is only maintained in the parallel 
  version of the HDF5 library, the notion of a minimum clean size 
  still applies in the serial case.  Here it is used to force a 
  mix of clean and dirty entries in the cache even in the write only
  case. 
  
<p>This in turn reduces the number of redundant flushes by avoiding 
  the case in which the cache fills with dirty metadata and all 
  entries must be flushed before a clean entry can be evicted to 
  make room for a new entry.

<p>Observe that in both the serial and parallel cases, the maintenance
  of a minimum clean size modifies the replacement policy, as dirty 
  entries may be flushed earlier than would otherwise be the case so 
  as to maintain the desired amount of clean and/or empty space in 
  the cache.

<p>While the new metadata cache only supports the LRU replacement
  policy at present, that may change.  Support for multiple
  replacement policies was very much in mind when the cache was
  designed, as was the ability to switch replacement policies at
  run time.  The situation has been complicated by the later addition
  of the adaptive cache resizing requirement, as two of the resizing
  algorithms piggyback on the LRU list.  However, if there is need
  for additional replacement policies, it shouldn't be too hard to
  implement them.
 
<!-- NEW PAGE -->
<h2>3. Adaptive Cache Resizing in the New Metadata Cache</h2>

<p>As mentioned earlier, the metadata working set size for a HDF5
  file varies wildly depending on the structure of the file and the
  access pattern.  For example, a 2 MB limit on metadata cache size
  is excessive for an H5repack of almost all HDF5 files we have tested.
  However, I have a file submitted by one of our users that will
  run a 13% hit rate with this cache size, and will lock up one of our
  Linux boxes using the old metadata cache.  Increase the new metadata
  cache size to 4 MB, and the hit rate exceeds 99%.

<p>In this case the main culprit is a root group with more than
  20,000 entries in it.  As a result, the root group heap exceeds
  1 MB, which tends to crowd out the rest of the metadata in a 2 MB
  cache.

<p>This case and a number of synthetic tests convinced us that we
  needed to modify the new metadata cache to expand and contract
  according to need within user specified bounds.

<p>I was unable to find any previous work on this problem, so I
  invented solutions as I went along.  If you are aware of prior
  work, please send me references.  The closest I was able to come
  was a group of embedded CPU designers who were turning off
  sections of their cache to conserve power.
  
<h3>3.1. Increasing the Cache Size</h3>

<p>In the context of the HDF5 library, the problem of increasing the
  cache size as necessary to contain the current working set turns
  out to involve two rather different issues.

<p>The first of these, which was recognized immediately, is the 
  problem of recognizing long term changes in working set size, and
  increasing the cache size accordingly, while not reacting to 
  transients.

<p>The second, which I recognized the hard way, is to adjust the cache 
  size for sudden, dramatic increases in working set size caused by 
  requests for large pieces of metadata which may be larger than the 
  current metadata cache size.

<p>The algorithms for handling these situations are discussed below.
  These problems are largely orthogonal to each other, so both algorithms
  may be used simultaneously.
  
<h4>3.1.1. Hit Rate Threshold Cache Size Increment</h4>

<p>Perhaps the most obvious heuristic for identifying cases in which
  the cache is too small involves monitoring the hit rate.  If the hit
  rate is low for a while, and the cache is at its current maximum size,
  the current maximum cache size is probably too small.  

<p>The hit rate threshold algorithm for increasing cache size
  applies this intuition directly.

<p>Hit rate statistics are collected over a user specified number
  of cache accesses.  This period is known as an epoch.

<p>At the end of each epoch, the hit rate is computed, and the
  counters are reset.  If the hit rate is below a user specified
  threshold and the cache is at its current maximum size, the maximum
  size of the cache is increased by a user specified multiple.  If
  required, the new cache maximum size is clipped to stay within the
  user specified upper bound on the maximum cache size, and optionally,
  within a user specified maximum increment.

<p>My tests indicate that this algorithm works well in most cases.
  However, in a synthetic test in which hit rate increased slowly with
  cache size, and load remained steady for many epochs, I observed a
  case in which cache size increased until hit rate just exceeded
  the specified minimum and then stalled.  This is a problem, as to
  avoid volatility, it is necessary to set the minimum hit rate
  threshold well below the desired hit rate.  Thus we may find ourselves
  with a cache running with a 91% hit rate when we really want it to
  increase its size until the hit rate is about 99%.

<p>If this case occurs frequently in actual use, I will have to
  come up with an improved algorithm.  Please let me know if you 
  see this behavior.  However, I had to work rather hard to create 
  it in my synthetic tests, so I would expect it to be uncommon.
  
<h4>3.1.2. Flash Cache Size Increment</h4>

<p>A fundamental problem with the above algorithm is that it contains the
  hidden assumption that cache entries are relatively small in comparison
  to the cache itself.  While I knew this assumption was not generally 
  true when I developed the algorithm, I thought that cases where it 
  failed would be so rare as to not be worth considering, as even if 
  they did occur, the above algorithm would rectify the situation 
  within an epoch or two.

<p>While it is true that such occurrences are rare, and it is true that
  the hit rate threshold cache size increment algorithm will rectify
  the situation eventually, the performance degradation experienced 
  by users while waiting for the epoch to end was so extreme that 
  some way of accelerating response to such situations was essential.

<p>To understand the problem, consider the following use case:
  
<p>Suppose we create a group, and then repeatedly create a new data 
  set in the group, write some data to it and then close it.

<p>In some versions of the HDF5 file format, the names of the datasets 
  will be stored in a local heap associated with the group, and the 
  space for that heap will be allocated in a single, contiguous chunk.  
  When this local heap is full, we allocate a new chunk twice the size 
  of the old, copy the data from the old local heap into the new, and 
  discard the old local heap.

<p>By default, the minimum metadata cache size is set to 2 MB.  Thus in
  this use case, our hit rate will be fine as long as the local heap is
  no larger than a little less than 2 MB, as the group related metadata 
  is accessed frequently and never evicted, and the data set related 
  metadata is never accessed once the data set is closed, and thus is 
  evicted smoothly to make room for new data sets.

<p>All this changes abruptly when the local heap finally doubles in size
  to a value above the slightly less than 2 MB limit.  All of a sudden, 
  the local heap is the size of the metadata cache, and the cache must 
  constantly swap it in to access it, and then swap it out to make room 
  for other metadata.  

<p>The hit rate threshold based algorithm for increasing the cache 
  size will fix this problem eventually, but performance will be very 
  bad until it does, as the metadata cache will be largely ineffective
  until its size is increased.

<p>An obvious heuristic for addressing this "big rock in a small pond" 
  issue is to watch for large "incoming rocks", and increase the size
  of the "pond" if the rock is so big that it will force most of the 
  "water" out of the "pond".

<p>The add space flash cache size increment algorithm applies
  this intuition directly:

<p>Let x be either the size of a newly inserted entry, a newly loaded
  entry, or the number of bytes by which the size of an existing entry 
  has been increased (i.e. the size of the "rock").

<p>If x is greater than some user specified fraction of the current 
  maximum cache size, increase the current maximum cache size by x 
  times some user specified multiple, less any free space that was in
  the cache to begin with.  Further, to avoid confusing the other 
  cache size increment/decrement code, start a new epoch.

<p>At present, this algorithm pays no attention to any user specified limit
  on the maximum size of any single cache size increase, but it DOES stay
  within the user specified upper bound on the maximum cache size.

<p>While it should be easy to see how this algorithm could be fooled into 
  inactivity by a large number of entries that were not quite large enough
  to cross the threshold, in practice it seems to work reasonably well.

<p>Needless to say, I will revisit the issue should this cease to be the
  case.

<h3>3.2. Decreasing the Cache Size</h3>

<p>Identifying cases in which the maximum cache size is larger than 
  necessary turned out to be more difficult.
  
<h4>3.2.1. Hit Rate Threshold Cache Size Reduction</h4>

<p>One obvious heuristic is to monitor the hit rate and guess that we
  can safely decrease cache size if hit rate exceeds some user supplied
  threshold (say .99995).

<!-- NEW PAGE -->

<p>The hit rate threshold size decrement algorithm implemented in the
  new metadata cache implements this intuition as follows:

<p>At the end of each epoch (this is the same epoch that is used in
  the cache size increment algorithm), the hit rate is compared with
  the user specified threshold.  If the hit rate exceeds that threshold,
  the current maximum cache size is decreased by a user specified factor.
  If required, the size of the reduction is clipped to stay within a user
  specified lower bound on the maximum cache size, and optionally, within
  a user specified maximum decrement.

<p>In my synthetic tests, this algorithm works poorly.  Even with a
  very high threshold and a small maximum reduction, it results in
  cache size oscillations.  The size increment code typically increments
  maximum cache size above the working set size.  This results in a high
  hit rate, which causes the threshold size decrement code to reduce the
  maximum cache size below the working set size, which causes hit rate to
  crash causing the cycle to repeat.  The resulting average hit rate is
  poor.

<p>It remains to be seen if this behavior will be seen in the field.
  The algorithm is available for use, but it wouldn't be my first choice.
  If you use it, please report back.
  
<h4>3.2.2. Ageout Cache Size Reduction</h4>

<p>Another heuristic for dealing with oversized cache conditions is to
  look for entries that haven't been accessed for a long time, evict
  them, and reduce the cache size accordingly.

<p>The age out cache size reduction applies this intuition as follows:
  At the end of each epoch (again the same epoch as used in the cache
  size increment algorithm), all entries that haven't been accessed for
  a user configurable number of epochs (1 - 10 at present) are evicted.
  The maximum cache size is then reduced to equal the sum of the sizes
  of the remaining entries.  The size of the reduction is clipped to stay
  within a user specified lower bound on maximum cache size, and
  optionally, within a user specified maximum decrement.

<p>In addition, the user may specify a minimum fraction of the cache
  which must be empty before the cache size is reduced.  Thus if an
  empty reserve of 0.1 was specified on a 10 MB cache, there would be no
  cache size reduction unless the eviction of aged out entries resulted
  in more than 1 MB of empty space.  Further, even after the reduction,
  the cache would be one tenth empty.

<p>In my synthetic tests, the age out algorithm works rather well,
  although it is somewhat sensitive to the epoch length and age out
  period selection.

<h4>3.2.3. Ageout With Hit Rate Threshold Cache Size Reduction</h4>

<p>To address these issues, I combined the hit rate threshold and
  age out heuristics.

<p>Age out with threshold works just like age out, except that the
  algorithm is not run unless the hit rate exceeded a user specified
  threshold in the previous epoch.

<p>In my synthetic tests, age out with threshold seems to work
  nicely, with no observed oscillation.  Thus I have selected it as
  the default cache size reduction algorithm.

<p>For those interested in such things, the age out algorithm is
  implemented by inserting a marker entry at the head of the LRU
  list at the beginning of each epoch.  Entries that haven't been
  accessed for at least n epochs are simply entries that appear in
  the LRU list after the n-th marker at the end of an epoch.

<h2>4. Configuring the New Metadata Cache</h2>

<p>Due to lack of resources, the design work on the automatic cache
  size adjustment algorithms was done hastily, using primarily synthetic
  tests.  I don't think I spent more than a couple weeks writing and
  running performance tests -- most time went into coding and
  functional testing.

<p>As a result, while I think the algorithms provided for adaptive
  cache resizing will work well in actual use, I don't really know
  (although preliminary results from the field are promising).
  Fortunately, the issue shouldn't arise for the vast majority of
  HDF5 users, and those for whom it may arise should be savvy enough
  to recognize problems and deal with them.

<p>For this latter class of users, I have implemented a number of
  new API calls allowing the user to select and configure the cache
  resize algorithms, or to turn them off and control cache size
  directly from the user program.  There are also API calls that
  allow the user program to monitor hit rate and cache size.

<p>From the user perspective, all the cache configuration data
  for a given file is contained in an instance of the
  H5AC_cache_config_t structure -- the definition of which is given
  below:

<pre>
        typedef struct H5AC_cache_config_t
        {
            /* general configuration fields: */
            int                         version;

            hbool_t                     rpt_fcn_enabled;

            hbool_t                     open_trace_file;
            hbool_t                     close_trace_file;
            char                        trace_file_name
                                          [H5AC__MAX_TRACE_FILE_NAME_LEN + 1];

            hbool_t                     evictions_enabled;

            hbool_t                     set_initial_size;
            size_t                      initial_size;

            double                      min_clean_fraction;

            size_t                      max_size;
            size_t                      min_size;

            long int                    epoch_length;


            /* size increase control fields: */
            enum H5C_cache_incr_mode    incr_mode;

            double                      lower_hr_threshold;

            double                      increment;

            hbool_t                     apply_max_increment;
            size_t                      max_increment;

            enum H5C_cache_flash_incr_mode flash_incr_mode;
            double                      flash_multiple;
	    double                      flash_threshold;


            /* size decrease control fields: */
            enum H5C_cache_decr_mode    decr_mode;

            double                      upper_hr_threshold;

            double                      decrement;

            hbool_t                     apply_max_decrement;
            size_t                      max_decrement;

            int                         epochs_before_eviction;

            hbool_t                     apply_empty_reserve;
            double                      empty_reserve;


            /* parallel configuration fields: */
            int                         dirty_bytes_threshold;

        } H5AC_cache_config_t;
</pre>

<p>This structure is defined in <code>H5ACpublic.h</code>.  Each 
  field is discussed below and in the associated header comment.

<p>The C API allows you to get and set this structure directly.  Unfortunately
  the Fortran API has to do this with individual parameters for each of the
  fields (with the exception of version).

<p>While the API calls are discussed individually in the reference
  manual, the following high level discussion of what fields to change
  for different purposes should be useful.
  
<h3>4.1. General Configuration</h3>

<p>The version field is intended to allow THG to change the
  <code>H5AC_cache_config_t</code> structure without breaking old 
  code.  For now, this field should always be set to 
  <code>H5AC__CURR_CACHE_CONFIG_VERSION</code>, even when
  you are getting the current configuration data from the cache.  The
  library needs the version number to know where fields are located with
  reference to the supplied base address.

<p>The <code>rpt_fcn_enabled</code> field is a boolean flag that allows 
  you to turn on and off the resize reporting function that reports the 
  activities of the adaptive cache resize code at the end of each epoch 
  -- assuming that it is enabled.

<p>The report function is unsupported, so you are on your own if you use
  it.  Since it dumps status data to stdout, you should not attempt to use
  it with Windows unless you modify the source.  You may find it useful if
  you want to experiment with different adaptive resize configurations.
  It is also a convenient way of diagnosing poor cache configuration.
  Finally, if you do lots of runs with identical behavior, you can use it
  to determine the metadata cache size needed in each phase of your
  program so you can set the required cache sizes manually.

<p>The trace file fields are also unsupported.  They allow one to open
  and close a trace file in which all calls to the metadata cache are
  logged in a user specified file for later analysis.  The feature is
  intended primarily for THG use in debugging or optimizing the metadata
  cache in cases where users in the field observe obscure failures or poor
  performance that we cannot re-create in the lab.  The trace file will
  allow us to re-create the exact sequence of cache operations that are
  triggering the problem.

<p>At present we do not have a play back utility for trace files,
  although I imagine that we will write one quickly when and if we need
  it.

<p>To enable the trace file, you load the full path of the desired
  trace file into <code>trace_file_name</code>, and set <code>
  open_trace_file</code> to <code>TRUE</code>.  In the parallel case, 
  an ASCII representation of the rank of each process is appended to 
  the supplied trace file name to create a unique trace file name for 
  that process.

<p>To close an open trace file, set <code>close_trace_file</code> to <code>TRUE</code>.

<p>It must be emphasized that you are on your own if you play with
  the trace file feature absent a request from THG.  Needless to say,
  the trace file feature is disabled by default.  If you enable it, you
  will take a large performance hit and generate huge trace files.


<p>The <code>evictions_enabled</code> field is a boolean flag allowing 
  the user to disable the eviction of entries from the metadata cache.
  Under normal operation conditions, this field will always be set
  to <code>TRUE</code>.

<p>In rare circumstances, the raw data throughput requirements may
  be so high that the user wishes to postpone metadata writes so as to
  reserve I/O throughput for raw data.  The <code>evictions_enabled</code>
  field exists to allow this -- although the user is to be warned that the
  metadata cache will grow without bound while evictions are disabled.
  Thus evictions should be re-enabled as soon as possible, and it may
  be wise to monitor cache size and statistics (to see how to enable
  statistics, see the debugging facilities section below).

<p>Evictions may only be disabled when the automatic cache resize
  code is disabled as well.  Thus to disable evictions, not only must
  the user set the <code>evictions_enabled</code> field to 
  <code>FALSE</code>, but he must also set <code>incr_mode</code> to 
  <code>H5C_incr__off</code>, set <code>flash_incr_mode</code> to 
  <code>H5C_flash_incr__off</code>, and set <code>decr_mode</code> to 
  <code>H5C_decr__off</code>.

<p>To re-enable evictions, just set <code>evictions_enabled</code> 
  back to <code>TRUE</code>.

<p>Before passing on to other subjects, it is worth re-iterating
  that disabling evictions is an extreme step.  Before attempting it,
  you might consider setting a large cache size manually, and flushing
  the cache just before high raw data throughput is required.  This
  may yield the desired results without the risks inherent in
  disabling evictions.


<p>The <code>set_initial_size</code> and <code>initial_size</code> 
  fields allow you to specify an initial maximum cache size.  If 
  <code>set_initial_size</code> is <code>TRUE</code>, 
  <code>initial_size</code> must lie in the interval 
  [<code>min_size</code>, <code>max_size</code>] (see below for a 
  discussion of the <code>min_size</code> and <code>max_size</code>
  fields).

<p>If you disable the adaptive cache resizing code (done by setting
  <code>incr_mode</code> to <code>H5C_incr__off</code>, 
  <code>flash_incr_mode</code> to <code>H5C_flash_incr__off</code>, and 
  <code>decr_mode</code> to <code>H5C_decr__off</code>), you
  can use these fields to control maximum cache size manually, as the
  maximum cache size will remain at the initial size.

<p>Note that the maximum cache size is only modified when
  <code>set_initial_size</code> is <code>TRUE</code>.  This allows 
  the use of configurations specified at compile time to change 
  resize configuration without altering the current maximum size 
  of the cache.  Without this feature, an additional call would be 
  required to get the current maximum cache size so as to set the 
  <code>initial_size</code> to the current maximum cache size, and 
  thereby avoid changing it.

<p>The <code>min_clean_fraction</code> sets the current minimum 
  clean size as a fraction of the current max cache size.  While
  this field was originally used only in the parallel version of 
  the library, it now applies to the serial version as well. 
  Its value must lie in the range [0.0, 1.0].  0.01 is reasonable 
  in the serial case, and 0.3 in the parallel.

<p>A potential interaction, discovered at release 1.8.3, 
  between the enforcement of the <code>min_clean_fraction</code> 
  and the adaptive cache resize code can severely degrade performance. 
  While this interaction is easily dealt with in the serial case by 
  setting <code>min_clean_fraction</code> to 0.01, the problem is 
  more difficult in the parallel case.  Please see the 
  &ldquo;<a href="#MDC_Interactions">Interactions</a>&rdquo; 
  section below for further details. 

<p>The <code>max_size</code> and <code>min_size</code> fields specify 
  the range of maximum sizes that may be set for the cache by the 
  automatic resize code.  <code>min_size</code> must be less than or 
  equal to <code>max_size</code>, and both must lie in the range
  [<code>H5C__MIN_MAX_CACHE_SIZE</code>, 
  <code>H5C__MAX_MAX_CACHE_SIZE</code>] -- currently [1 KB, 128 MB].  
  If you routinely run a cache size in the top half of this range, 
  you should increase the hash table size.  To do this, modify the 
  <code>H5C__HASH_TABLE_LEN</code> #define in <code>H5Cpkg.h</code>
  and re-compile.  At present, <code>H5C__HASH_TABLE_LEN</code>
  must be a power of two.

<p>The <code>epoch_length</code> is the number of cache accesses 
  between runs of the adaptive cache size control algorithms.  It 
  is ignored if these algorithms are turned off.  It must lie in 
  the range [<code>H5C__MIN_AR_EPOCH_LENGTH</code>, 
  <code>H5C__MAX_AR_EPOCH_LENGTH</code>] -- currently [100, 1000000].  
  The above constants are defined in <code>H5Cprivate.h</code>.  
  50000 is a reasonable value.

<h3>4.2. Increment Configuration</h3>

<p>The <code>incr_mode</code> field specifies the cache size increment
  algorithm used.  Its value must be a member of the 
  <code>H5C_cache_incr_mode</code> enum type -- currently either 
  <code>H5C_incr__off</code> or <code>H5C_incr__threshold</code> 
  (note the double underscores after "incr").  This type is defined 
  in <code>H5Cpublic.h</code>.

<p>If <code>incr_mode</code> is set to <code>H5C_incr__off</code>, 
  regular automatic cache size increases are disabled, and the 
  <code>lower_hr_threshold</code>, <code>increment</code>, 
  <code>apply_max_increment</code>, and <code>max_increment</code>
  fields are ignored.

<p>The <code>flash_incr_mode</code> field specifies the flash cache size
  increment algorithm used.  Its value must be a member of the
  <code>H5C_cache_flash_incr_mode</code> enum type -- currently either 
  <code>H5C_flash_incr__off</code> or <code>H5C_flash_incr__add_space</code>
  (note the double underscores after "incr").  This type is defined 
  in <code>H5Cpublic.h</code>.

<p>If <code>flash_incr_mode</code> is set to <code>H5C_flash_incr__off</code>,
  flash cache size increases are disabled, and the 
  <code>flash_multiple</code> and <code>flash_threshold</code> 
  fields are ignored.

<h4>4.2.1. Hit Rate Threshold Cache Size Increase Configuration</h4>

<p>If <code>incr_mode</code> is <code>H5C_incr__threshold</code>, 
  the cache size is increased via the hit rate threshold algorithm.  
  The remaining fields in the section are then used as follows:

<p><code>lower_hr_threshold</code> is the threshold below which 
  the hit rate must fall to trigger an increase.  The value must 
  lie in the range [0.0, 1.0].  In my tests, a relatively high 
  value seems to work best -- 0.9 for example.

<p><code>increment</code> is the factor by which the old maximum 
  cache size is multiplied to obtain an initial new maximum cache 
  size when an increment is needed.  The actual change in size may be
  smaller as required by <code>max_size</code> (above) and 
  <code>max_increment</code> (discussed below).  <code>increment</code>
  must be greater than or equal to 1.0.  If you set it to 1.0, you 
  will effectively turn off the increment code.  2.0 is a reasonable value.

<p><code>apply_max_increment</code> and <code>max_increment</code> 
  allow the user to specify a maximum increment.  If 
  <code>apply_max_increment</code> is <code>TRUE</code>, the cache
  size will never be increased by more than the number of bytes
  specified in <code>max_increment</code> in any single increase.

<h4>4.2.2. Flash Cache Size Increase Configuration</h4>

<p>If <code>flash_incr_mode</code> is set to 
  <code>H5C_flash_incr__add_space</code>, flash cache size increases
  are enabled.  The size of the cache will be increased under the 
  following circumstances:

<p>Let t be the current maximum cache size times the value of the
  <code>flash_threshold</code> field.

<p>Let x be either the size of the newly inserted entry, the size of
  the newly loaded entry, or the number of bytes added to the size of 
  the entry under consideration for triggering a flash cache size 
  increase.

<p>If t &lt; x, the basic condition for a flash cache size increase 
  is met, and we proceed as follows:

<p>Let space_needed equal x less the amount of free space in the cache.

<p>Further, let increment equal space_needed times the value of the
  <code>flash_multiple</code> field.  If increment plus the current cache 
  size is greater than <code>max_size</code> (discussed above), reduce 
  increment so that increment plus the current cache size is equal to 
  <code>max_size</code>.

<p>If increment is greater than zero, increase the current cache size
  by increment.  To avoid confusing the other cache size increment
  or decrement algorithms, start a new epoch.  Note however, that we
  do not cycle the epoch markers if some variant of the age out 
  algorithm is in use.

<p>The use of the <code>flash_threshold</code> field is discussed 
  above.  It must be a floating point value in the range of [0.1, 1.0].
  0.25 is a reasonable value.

<p>The use of the <code>flash_multiple</code> field is also discussed
  above.  It must be a floating point value in the range of [0.1, 10.0].
  1.4 is a reasonable value.
  
<h3>4.3. Decrement Configuration</h3>

<p>The <code>decr_mode</code> field specifies the cache size decrement 
  algorithm used.  Its value must be a member of the 
  <code>H5C_cache_decr_mode</code> enum type -- currently either 
  <code>H5C_decr__off</code>, <code>H5C_decr__threshold</code>,
  <code>H5C_decr__age_out</code>, or 
  <code>H5C_decr__age_out_with_threshold</code> (note the
  double underscores after "decr").  This type is defined in
  <code>H5Cpublic.h</code>.

<p>If <code>decr_mode</code> is set to <code>H5C_decr__off</code>, 
  automatic cache size decreases are disabled, and the remaining 
  fields in the cache size decrease control section are ignored.
  
<h4>4.3.1. Hit Rate Threshold Cache Size Decrease Configuration</h4>

<p>If <code>decr_mode</code> is <code>H5C_decr__threshold</code>, 
  the cache size is decreased by the threshold algorithm, and the 
  remaining fields of the decrement section are used as follows:

<p><code>upper_hr_threshold</code> is the threshold above which 
  the hit rate must rise to trigger cache size reduction.  It must 
  be in the range [0.0, 1.0].  In my synthetic tests, very high
  values like .9995 or .99995 seemed to work best.

<p><code>decrement</code> is the factor by which the current 
  maximum cache size is multiplied to obtain a tentative new 
  maximum cache size.  It must lie in the range [0.0, 1.0].  
  Relatively large values like .9 seem to work best in my synthetic
  tests.  Note that the actual size reduction may be smaller
  as required by <code>min_size</code> and <code>max_decrement</code> 
  (discussed below).

<!-- NEW PAGE -->
<p><code>apply_max_decrement</code> and <code>max_decrement</code> 
  allow the user to specify a maximum decrement.  If 
  <code>apply_max_decrement</code> is <code>TRUE</code>, cache 
  size will never be reduced by more than <code>max_decrement</code>
  bytes in any single reduction.

<p>With the hit rate threshold cache size decrement algorithm,
  the remaining fields in the section are ignored.

<h4>4.3.2. Ageout Cache Size Reduction</h4>

<p>If <code>decr_mode</code> is <code>H5C_decr__age_out</code> 
  the cache size is decreased by the ageout algorithm, and the 
  remaining fields of the decrement section are used as follows:

<p><code>epochs_before_eviction</code> is the number of epochs an 
  entry must reside unaccessed in the cache before it is evicted.
  This value must lie in the range [1, <code>H5C__MAX_EPOCH_MARKERS</code>].
  <code>H5C__MAX_EPOCH_MARKERS</code> is defined in 
  <code>H5Cprivate.h</code>, and is currently set to 10.

<p><code>apply_max_decrement</code> and <code>max_decrement</code> are 
  used as in section 4.3.1.

<p><code>apply_empty_reserve</code> and <code>empty_reserve</code> 
  allow the user to specify a minimum empty reserve as discussed in 
  section 2.3.2.2.  An empty reserve of 0.05 or 0.1 seems to work
  well.

<p>The <code>decrement</code> and <code>upper_hr_threshold</code> 
  fields are ignored in this case.

<h4>4.3.3. Ageout With Hit Rate Threshold Cache Size Reduction</h4>

<p>If <code>decr_mode</code> is 
  <code>H5C_decr__age_out_with_threshold</code>, the cache
  size is decreased by the ageout with hit rate threshold algorithm,
  and the fields of the decrement section are used as per the Ageout
  algorithm (see 4.3.2) with the exception of 
  <code>upper_hr_threshold</code>.

<p>Here, <code>upper_hr_threshold</code> is the threshold above 
  which the hit rate must rise to trigger cache size reduction.  
  It must be in the range [0.0, 1.0].  In my synthetic tests, 
  high values like .999 seemed to work well.

<h3>4.4. Parallel Configuration</h3>

<p>This section is a catch-all for parallel specific
  configuration data.  At present, it has only one field --
  <code>dirty_bytes_threshold</code>.

<p>In PHDF5, all operations that modify metadata must be
  executed collectively.  We used to think that this was
  enough to ensure consistency across the metadata caches, but
  since we allow processes to read metadata individually, the
  order of dirty entries in the LRU list can vary across
  processes.  This in turn can change the order in which dirty
  metadata cache entries reach the bottom of the LRU and are
  flushed to disk -- opening the door to messages from the past
  and messages from the future bugs.

<p>To prevent this, only the metadata cache on process 0 of
  the file communicator is allowed to write to file, and then
  only after entering a sync point with the other caches.
  After it writes entries to file, it sends the base addresses
  of the now clean entries to the other caches, so they can mark
  these entries clean as well, and then leaves the sync point.
  The other caches mark the specified entries as clean before
  they leave the sync point as well. (Observe, that since all
  caches see the same stream of dirty metadata, they will all
  have the same set of dirty entries upon sync point entry and
  exit.)

<p>The different caches know when to synchronize by counting
  the number of bytes of dirty metadata created by the
  collective operations modifying metadata.  Whenever this count
  exceeds the value specified in the dirty_bytes_threshold, they
  all enter the sync point, and process 0 flushes down to its
  minimum clean size and sends the list of newly cleaned entries
  to the other caches.

<p>Needless to say, the value of the <code>dirty_bytes_threshold</code>
  field must be consistent across all the caches operating on
  a given file.

<p>All dirty metadata can also be flushed under programmatic
  control via the H5Fflush() call.  This call must be collective,
  and will reset the dirty data counts on each metadata cache.

<p>Absent calls to H5Fflush(), dirty metadata will only be
  flushed when the <code>dirty_bytes_threshold</code> is exceeded, 
  and then only down to the min_clean_fraction.  Thus, if a program
  does all its metadata modifications in one phase, and then
  doesn't modify metadata thereafter, a residue of dirty metadata
  will be frozen in the metadata caches for the remainder of the
  computation -- effectively reducing the sizes of the caches.

<p>In the default configuration, the caches will eventually
  resize themselves to maintain an acceptable hit rate.  However,
  this will take time, and it will increase the application's
  footprint in memory.

<p>If your application behaves in this manner, you can avoid
  this by a collective call to H5Fflush() immediately after the
  metadata modification phase.

<a name="MDC_Interactions">
<h3>4.5. Interactions</h3>
</a>

<p>Evictions may not be disabled unless the automatic cache
  resize code is disabled as well (by setting <code>decr_mode</code>
  to <code>H5C_decr__off</code>, <code>flash_incr_mode</code> to
  <code>H5C_flash_incr__off</code>, and <code>incr_mode</code> to 
  <code>H5C_incr__off</code>) -- thus placing the cache size under 
  the direct control of the user program.

<p>There is no logical necessity for this restriction.  It is
  imposed because it simplifies testing greatly, and because I
  can't see any reason why one would want to disable evictions
  while the automatic cache size adjustment code was enabled.
  This restriction can be relaxed if anyone can come up with a
  good reason to do so.

<p>At present there are two interactions between the
  increment and decrement sections of the configuration.

<p>If <code>incr_mode</code> is <code>H5C_incr__threshold</code>, 
  and <code>decr_mode</code> is either <code>H5C_decr__threshold</code>
  or <code>H5C_decr__age_out_with_threshold</code>, then
  <code>lower_hr_threshold</code> must be strictly less than 
  <code>upper_hr_threshold</code>.

<p>Also, if the flash cache size increment code is enabled and is 
  triggered, it will restart the current epoch without calling any
  other cache size increment or decrement code.

<p>In both the serial and parallel cases, there is the potential for 
  an interaction between the <code>min_clean_fraction</code> and
  the cache size increment code that can severely degrade performance.
  Specifically, if the 
  <code>min_clean_fraction</code> is large enough, it is possible that
  keeping the specified fraction of the cache clean may generate 
  enough flushes to seriously degrade performance even though the 
  hit rate is excellent.

<p>In the serial case, this is easily dealt with by selecting a very
  small <code>min_clean_fraction</code> -- 0.01 for example -- as 
  this still avoids the "metadata blizzard" phenomenon that appears
  when the cache fills with dirty metadata and must then flush all 
  of it before evicting an entry to make space for a new entry.

<p>The problem is more difficult in the parallel case, as the 
  <code>min_clean_fraction</code> is used to ensure that the cache 
  contains clean entries that can be evicted to make space for 
  new entries when metadata writes are forbidden -- i.e. between 
  sync points.

<p>This issue was discovered shortly before release 1.8.3 and
  an automated solution has not been implemented.
  Should it become an issue for an application, try manually setting
  the cache size to ~1.5 times the maximum working set size for the 
  application, and leave <code>min_clean_fraction</code> set to 0.3.

<p>You can approximate the working set size of your application via 
  repeated calls to <code>H5Fget_mdc_size()</code> and 
  <code>H5Fget_mdc_hit_rate()</code> while running your program with
  the cache resize code enabled.  The maximum value returned by 
  <code>H5Fget_mdc_size()</code> should be a reasonable approximation 
  -- particularly if the associated hit rate is good.

<!--
<p>I believe this interaction between <code>min_clean_fraction</code> 
  and the cache size increase code is quite susceptible to automated
  solutions, and hope to have a better solution for parallel HDF5 
  users in the 1.8.4 release.
-->

<p>In the parallel case, there is also an interaction between
  <code>min_clean_fraction</code> and <code>dirty_bytes_threshold</code>.  
  Absent calls to H5Fflush() (discussed above), the upper bound on the 
  amount of dirty data in the metadata caches will oscillate between
  (1 - <code>min_clean_fraction</code>) times current maximum cache size, 
  and that value plus the <code>dirty_bytes_threshold</code>.  Needless 
  to say, it will be best if the <code>min_size</code>, 
  <code>min_clean_fraction</code>, and the <code>dirty_bytes_threshold</code>
  are chosen so that the cache can't fill with dirty data.

<h3>4.6.  Default Metadata Cache Configuration</h3>

<p>Starting with release 1.8.3, HDF5 provides different default metadata cache
  configurations depending on whether the library is compiled for serial or
  parallel.  

<p>The default configuration for the serial case is as follows:
  
<pre>
{
  /* int         version                = */ H5C__CURR_AUTO_SIZE_CTL_VER,
  /* hbool_t     rpt_fcn_enabled        = */ FALSE,
  /* hbool_t     open_trace_file        = */ FALSE,
  /* hbool_t     close_trace_file       = */ FALSE,
  /* char        trace_file_name[]      = */ "",
  /* hbool_t     evictions_enabled      = */ TRUE,
  /* hbool_t     set_initial_size       = */ TRUE,
  /* size_t      initial_size           = */ ( 2 * 1024 * 1024),
  /* double      min_clean_fraction     = */ 0.01,
  /* size_t      max_size               = */ (32 * 1024 * 1024),
  /* size_t      min_size               = */ ( 1 * 1024 * 1024),
  /* long int    epoch_length           = */ 50000,
  /* enum H5C_cache_incr_mode incr_mode = */ H5C_incr__threshold,
  /* double      lower_hr_threshold     = */ 0.9,
  /* double      increment              = */ 2.0,
  /* hbool_t     apply_max_increment    = */ TRUE,
  /* size_t      max_increment          = */ (4 * 1024 * 1024),
  /* enum H5C_cache_flash_incr_mode       */
  /*                    flash_incr_mode = */ H5C_flash_incr__add_space,
  /* double      flash_multiple         = */ 1.4,
  /* double      flash_threshold        = */ 0.25,
  /* enum H5C_cache_decr_mode decr_mode = */ H5C_decr__age_out_with_threshold,
  /* double      upper_hr_threshold     = */ 0.999,
  /* double      decrement              = */ 0.9,
  /* hbool_t     apply_max_decrement    = */ TRUE,
  /* size_t      max_decrement          = */ (1 * 1024 * 1024),
  /* int         epochs_before_eviction = */ 3,
  /* hbool_t     apply_empty_reserve    = */ TRUE,
  /* double      empty_reserve          = */ 0.1,
  /* int         dirty_bytes_threshold  = */ (256 * 1024)
}
</pre>

<p>The default configuration for the parallel case is as follows:
  
<pre>
{
  /* int         version                = */ H5C__CURR_AUTO_SIZE_CTL_VER,
  /* hbool_t     rpt_fcn_enabled        = */ FALSE,
  /* hbool_t     open_trace_file        = */ FALSE,
  /* hbool_t     close_trace_file       = */ FALSE,
  /* char        trace_file_name[]      = */ "",
  /* hbool_t     evictions_enabled      = */ TRUE,
  /* hbool_t     set_initial_size       = */ TRUE,
  /* size_t      initial_size           = */ ( 2 * 1024 * 1024),
  /* double      min_clean_fraction     = */ 0.3,
  /* size_t      max_size               = */ (32 * 1024 * 1024),
  /* size_t      min_size               = */ ( 1 * 1024 * 1024),
  /* long int    epoch_length           = */ 50000,
  /* enum H5C_cache_incr_mode incr_mode = */ H5C_incr__threshold,
  /* double      lower_hr_threshold     = */ 0.9,
  /* double      increment              = */ 2.0,
  /* hbool_t     apply_max_increment    = */ TRUE,
  /* size_t      max_increment          = */ (4 * 1024 * 1024),
  /* enum H5C_cache_flash_incr_mode       */
  /*                    flash_incr_mode = */ H5C_flash_incr__add_space,
  /* double      flash_multiple         = */ 1.0,
  /* double      flash_threshold        = */ 0.25,
  /* enum H5C_cache_decr_mode decr_mode = */ H5C_decr__age_out_with_threshold,
  /* double      upper_hr_threshold     = */ 0.999,
  /* double      decrement              = */ 0.9,
  /* hbool_t     apply_max_decrement    = */ TRUE,
  /* size_t      max_decrement          = */ (1 * 1024 * 1024),
  /* int         epochs_before_eviction = */ 3,
  /* hbool_t     apply_empty_reserve    = */ TRUE,
  /* double      empty_reserve          = */ 0.1,
  /* int         dirty_bytes_threshold  = */ (256 * 1024)
}
</pre>


<p>The default serial configuration should be adequate 
  for most serial HDF5 users.

<p>The same may not be true for the default parallel configuration
  due to the interaction between the 
  <code>min_clean_fraction</code> and the cache size increase code.
  See the 
  &ldquo;<a href="#MDC_Interactions">Interactions</a>&rdquo; 
  section for further details.

<p>Should you need to change the default configuration, it can be found in 
  <code>H5ACprivate.h</code>. Look for the definition of 
  <code>H5AC__DEFAULT_RESIZE_CONFIG</code>.
  
<h2>5. Controlling the New Metadata Cache Size From Your Program</h2>

<p>You have already seen how <code>H5AC_cache_config_t</code> has 
  facilities that allow you to control the metadata cache size directly.  Use
  <code>H5Fget_mdc_config()</code> and <code>H5Fset_mdc_config()</code> 
  to get and set the metadata cache configuration on an open file.  Use 
  <code>H5Pget_mdc_config()</code> and <code>H5Pset_mdc_config()</code> 
  to get and set the initial metadata cache configuration in a file 
  access property list.  Recall that this list contains configuration 
  data used when opening a file.

<p>Use <code>H5Fget_mdc_hit_rate()</code> to get the average hit rate 
  since the last time the hit rate stats were reset.  This happens 
  automatically at the beginning of each epoch if the adaptive cache 
  resize code is enabled.  You can also do it manually with 
  <code>H5Freset_mdc_hit_rate_stats()</code>.  Be careful about doing 
  this if the adaptive cache resize code is enabled, as you may confuse it.

<p>Use <code>H5Fget_mdc_size()</code> to get metadata cache size data 
  on an open file.

<p>Finally, note that cache size and cache footprint are two different
  things -- in my tests, the cache footprint (as inferred from the UNIX
  top command) is typically about three times the maximum cache size.  I
  haven't tracked it down yet, but I would guess that most of this is due
  to the very small typical cache entry size combined with the rather
  large size of cache entry header structure.  This should be investigated
  further, but there are other matters of higher priority.

<h2>6. New Metadata Cache Debugging Facilities</h2>

<p>The new metadata cache has a variety of debugging facilities
  that may be of use.  I doubt that any other than the report function
  and the trace file will ever be accessible via the API, but they are
  relatively easy to turn on in the source code.

<p>Note that none of this should be viewed as supported -- it is
  described here on the off chance that you want to use it, but you are
  on your own if you do.  Also, there are no promises as to consistency
  between versions.

<p>As mentioned above, you can use the <code>rpt_fcn_enabled</code> 
  field of the configuration structure to enable the default reporting 
  function (<code>H5C_def_auto_resize_rpt_fcn()</code> in <code>H5C.c</code>).
  If this function doesn't work for you, you will have to write your own.  
  In particular, remember that it uses stdout, so it will probably be 
  unhappy under Windows.

<p>Again, remember that this facility is not supported.  Further,
  it is likely to change every time I do any serious work on the cache.

<p>There is also extensive statistics collection code.  Use
  <code>H5C_COLLECT_CACHE_STATS</code> and 
  <code>H5C_COLLECT_CACHE_ENTRY_STATS</code> in <code>H5Cprivate.h</code>
  to turn this on.  If you also turn on <code>H5AC_DUMP_STATS_ON_CLOSE</code> 
  in H5ACprivate.h, stats will be dumped when you close a file.  
  Alternatively you can call <code>H5C_stats()</code> and 
  <code>H5C_stats__reset()</code> within the library to dump 
  and reset stats.  Both of these functions are defined in 
  <code>H5C.c</code>.

<p>Finally, the cache also contains extensive sanity checking
  code.  Much of this is turned on when you compile in debug mode,
  but to enable the full suite, turn on <code>H5C_DO_SANITY_CHECKS</code>
  in <code>H5Cprivate.h</code>.
  
<h2>7. Trouble Shooting</h2>

<p>Absent major bugs in the cache, the only trouble shooting you
  should have to do is diagnosing and fixing problems with your cache
  configuration.

<p>Assuming it runs on your platform (I've only used it under Linux),
  the reporting function is probably the most convenient diagnosis tool.
  However, since it is unsupported code, I will not discuss it further
  beyond directing you to the source 
  (<code>H5C_def_auto_resize_rpt_fcn()</code> in <code>H5C.c</code>).

<p>Absent the reporting function, regular calls to 
  <code>H5Fget_mdc_hit_rate()</code> should give you a good idea of 
  hit rate over time.  Remember that the hit rate stats are reset at 
  the end of each epoch (when adaptive cache resizing is enabled), so 
  you should expect some jitter.

<p>Similar calls to <code>H5Fget_mdc_size()</code> should allow you 
  to monitor cache size, and the fraction of the current maximum 
  cache size that is actually in use.

<p>If the hit rate is consistently low, and the cache is at its
  current maximum size, increasing the maximum size is an obvious fix.

<p>If you see hit rate and cache size oscillations, try disabling
  adaptive cache resizing and setting a fixed cache size a bit greater
  than the high end of the cache size oscillations you observed.

<p>If the hit rate oscillations don't go away, you are probably looking
  at a feature of your application which can't be helped without major
  changes to the cache.  Please send along a description of the situation.

<p>If the oscillations do go away, you may be able to come up with a
  configuration that deals with the situation.  If that fails, control
  cache size manually, and write me, so I can try to develop an adaptive
  resize algorithm that works in your case.

<p>Needless to say, you should give the cache a few epochs to adapt
  to circumstances.  If that is too slow for you, try manual cache
  size control.

<p>If you find it necessary to disable evictions, you may find it
  useful to enable the internal statistics collection code mentioned
  above in the section on debugging facilities.

<p>Amongst many other things, the stats code will report the
  maximum cache size, and the average successful and unsuccessful
  search depths in the hash table.  If these latter figures are
  significantly above 1, you should increase the size of the hash
  table.

<hr><br />
<p>Return to the <a href="../../Advanced.html">"Advanced Topics"</a> page.</p>
<br /><br />

<hr>
<!-- #BeginLibraryItem "/ed_libs/Footer.lbi" -->
<address>
<table width="100%" border="0">
  <tr valign="top">
      <td align="left">
          <address>
          The HDF Group Help Desk: <img src="../../Graphics/help.png" align=top height=16>
          <br>
          Describes HDF5 Release 1.8.13, May 2014.
          </address>
      </td><td width="5%">&nbsp;</td>
      <td align="right">
          <a href="../../Copyright.html">Copyright</a> by
          <a href="http://www.hdfgroup.org">The HDF Group</a>
          <br>
          and the Board of Trustees of the University of Illinois
      </td>   
  </tr>   
</table>
</address>
<!-- #EndLibraryItem --><SCRIPT LANGUAGE="JAVASCRIPT">
<!--
document.writeln("Last modified: 5 October 2010");
-->
</SCRIPT>
 
  
</body>
</html>