1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
|
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 2002,2010 Oracle. All rights reserved.
*
* $Id: SR12885Test.java,v 1.8.2.2 2010/01/04 15:30:42 cwl Exp $
*/
package com.sleepycat.je.cleaner;
import java.io.File;
import java.io.IOException;
import junit.framework.TestCase;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;
import com.sleepycat.je.config.EnvironmentParams;
import com.sleepycat.je.log.FileManager;
import com.sleepycat.je.util.TestUtils;
/**
* Reproduces a problem found in SR12885 where we failed to migrate a pending
* LN if the slot was reused by an active transaction and that transaction was
* later aborted.
*
* This bug can manifest as a LogFileNotFoundException. However, there was
* another bug that caused this bug to manifest sometimes as a NOTFOUND return
* value. This secondary problem -- more sloppiness than a real bug -- was that
* the PendingDeleted flag was not cleared during an abort. If the
* PendingDeleted flag is set, the low level fetch method will return null
* rather than throwing a LogFileNotFoundException. This caused a NOTFOUND in
* some cases.
*
* The sequence that causes the bug is:
*
* 1) The cleaner processes a file containing LN-A (node A) for key X. Key X
* is a non-deleted LN.
*
* 2) The cleaner sets the migrate flag on the BIN entry for LN-A.
*
* 3) In transaction T-1, LN-A is deleted and replaced by LN-B with key X,
* reusing the same slot but assigning a new node ID. At this point both node
* IDs (LN-A and LN-B) are locked.
*
* 4) The cleaner (via a checkpoint or eviction that logs the BIN) tries to
* migrate LN-B, the current LN in the BIN, but finds it locked. It adds LN-B
* to the pending LN list.
*
* 5) T-1 aborts, putting the LSN of LN-A back into the BIN slot.
*
* 6) In transaction T-2, LN-A is deleted and replaced by LN-C with key X,
* reusing the same slot but assigning a new node ID. At this point both node
* IDs (LN-A and LN-C) are locked.
*
* 7) The cleaner (via a checkpoint or wakeup) processes the pending LN-B. It
* first gets a lock on node B, then does the tree lookup. It finds LN-C in
* the tree, but it doesn't notice that it has a different node ID than the
* node it locked.
*
* 8) The cleaner sees that LN-C is deleted, and therefore no migration is
* necessary -- this is incorrect. It removes LN-B from the pending list,
* allowing the cleaned file to be deleted.
*
* 9) T-2 aborts, putting the LSN of LN-A back into the BIN slot.
*
* 10) A fetch of key X will fail, since the file containing the LSN for LN-A
* has been deleted. If we didn't clear the PendingDeleted flag, this will
* cause a NOTFOUND error instead of a LogFileNotFoundException.
*/
public class SR12885Test extends TestCase {

    /* Name of the single database opened by this test. */
    private static final String DB_NAME = "foo";

    /* Checkpoint configuration with the force flag set. */
    private static final CheckpointConfig forceConfig = new CheckpointConfig();
    static {
        forceConfig.setForce(true);
    }

    /* Environment home directory, taken from the TestUtils.DEST_DIR property. */
    private File envHome;

    /* Handles opened by openEnv(); null whenever they are not open. */
    private Environment env;
    private Database db;

    public SR12885Test() {
        envHome = new File(System.getProperty(TestUtils.DEST_DIR));
    }

    /**
     * Removes log files and cleaner-deleted (DEL_SUFFIX) files left in the
     * environment home before the test runs.
     */
    public void setUp()
        throws IOException, DatabaseException {

        TestUtils.removeLogFiles("Setup", envHome, false);
        TestUtils.removeFiles("Setup", envHome, FileManager.DEL_SUFFIX);
    }

    /**
     * Best-effort cleanup: closes any handles the test left open, removes the
     * files it created and drops all references.  Failures are reported on
     * stdout rather than thrown so that they do not hide the outcome of the
     * test itself.
     */
    public void tearDown()
        throws IOException, DatabaseException {

        /*
         * If the test failed before reaching closeEnv(), the database handle
         * is still open.  Close it before the environment, since an
         * Environment is not supposed to be closed while Database handles
         * that refer to it remain open.  Each close gets its own try block so
         * that a failure closing the database does not prevent the attempt to
         * close the environment.
         */
        try {
            if (db != null) {
                db.close();
            }
        } catch (Throwable e) {
            System.out.println("tearDown: " + e);
        }

        try {
            if (env != null) {
                env.close();
            }
        } catch (Throwable e) {
            System.out.println("tearDown: " + e);
        }

        try {
            TestUtils.removeLogFiles("tearDown", envHome, true);
            TestUtils.removeFiles("tearDown", envHome, FileManager.DEL_SUFFIX);
        } catch (Throwable e) {
            System.out.println("tearDown: " + e);
        }

        db = null;
        env = null;
        envHome = null;
    }

    /**
     * Opens the environment and database.
     */
    private void openEnv()
        throws DatabaseException {

        EnvironmentConfig config = TestUtils.initEnvConfig();
        DbInternal.disableParameterValidation(config);
        config.setTransactional(true);
        config.setAllowCreate(true);

        /* Do not run the daemons. */
        config.setConfigParam
            (EnvironmentParams.ENV_RUN_CLEANER.getName(), "false");
        config.setConfigParam
            (EnvironmentParams.ENV_RUN_EVICTOR.getName(), "false");
        config.setConfigParam
            (EnvironmentParams.ENV_RUN_CHECKPOINTER.getName(), "false");
        config.setConfigParam
            (EnvironmentParams.ENV_RUN_INCOMPRESSOR.getName(), "false");

        /* Use a small log file size to make cleaning more frequent. */
        config.setConfigParam(EnvironmentParams.LOG_FILE_MAX.getName(),
                              Integer.toString(1024));

        env = new Environment(envHome, config);

        openDb();
    }

    /**
     * Opens the database.
     */
    private void openDb()
        throws DatabaseException {

        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setTransactional(true);
        dbConfig.setAllowCreate(true);
        db = env.openDatabase(null, DB_NAME, dbConfig);
    }

    /**
     * Closes the database and then the environment, clearing each field once
     * its handle is closed.
     */
    private void closeEnv()
        throws DatabaseException {

        if (db != null) {
            db.close();
            db = null;
        }

        if (env != null) {
            env.close();
            env = null;
        }
    }

    /**
     * Drives the SR12885 sequence: set the migrate flag for key 0, get a
     * different node ID for key 0 onto the pending LN list, process the
     * pending list while key 0 is deleted in an open transaction, abort that
     * transaction, and finally verify that key 0 can still be fetched.
     */
    public void testSR12885()
        throws DatabaseException {

        openEnv();

        final int COUNT = 10;
        DatabaseEntry key = new DatabaseEntry();
        DatabaseEntry data = new DatabaseEntry(TestUtils.getTestArray(0));
        OperationStatus status;

        /* Add some records, enough to fill a log file. */
        for (int i = 0; i < COUNT; i += 1) {
            key.setData(TestUtils.getTestArray(i));
            status = db.putNoOverwrite(null, key, data);
            assertEquals(OperationStatus.SUCCESS, status);
        }

        /*
         * Delete all but key 0, so the first file can be cleaned but key 0
         * will need to be migrated.
         */
        for (int i = 1; i < COUNT; i += 1) {
            key.setData(TestUtils.getTestArray(i));
            status = db.delete(null, key);
            assertEquals(OperationStatus.SUCCESS, status);
        }

        /*
         * Checkpoint and clean to set the migrate flag for key 0.  This must
         * be done when key 0 is not locked, so that it will not be put onto
         * the pending list yet.  Below we cause it to be put onto the pending
         * list with a different node ID.
         */
        env.checkpoint(forceConfig);
        int cleaned = env.cleanLog();
        assertTrue("cleaned=" + cleaned, cleaned > 0);

        /*
         * Using a transaction, delete then insert key 0, reusing the slot.
         * The insertion assigns a new node ID.  Don't abort the transaction
         * until after the cleaner migration is finished.
         */
        Transaction txn = env.beginTransaction(null, null);
        key.setData(TestUtils.getTestArray(0));
        status = db.delete(txn, key);
        assertEquals(OperationStatus.SUCCESS, status);
        status = db.putNoOverwrite(txn, key, data);
        assertEquals(OperationStatus.SUCCESS, status);

        /*
         * Checkpoint again to perform LN migration.  LN migration will not
         * migrate key 0 because it is locked -- it will be put onto the
         * pending list.  But the LN put on the pending list will be the newly
         * inserted node, which has a different node ID than the LN that needs
         * to be migrated -- this is the first condition for the bug.
         */
        env.checkpoint(forceConfig);

        /*
         * Abort the transaction to revert to the original node ID for key 0.
         * Then perform a delete with a new transaction.  This makes the
         * current LN for key 0 deleted.
         */
        txn.abort();
        txn = env.beginTransaction(null, null);
        key.setData(TestUtils.getTestArray(0));
        status = db.delete(txn, key);
        assertEquals(OperationStatus.SUCCESS, status);

        /*
         * The current state of key 0 is that the BIN contains a deleted LN,
         * and that LN has a node ID that is different than the one in the
         * pending LN list.  This node is the one that needs to be migrated.
         *
         * Perform a checkpoint to cause pending LNs to be processed and then
         * delete the cleaned file.  When we process the pending LN, we'll lock
         * the pending LN's node ID (the one we inserted and aborted), which is
         * the wrong node ID.  We'll then examine the current LN, find it
         * deleted, and neglect to migrate the LN that needs to be migrated.
         * The error is that we don't lock the node ID of the current LN.
         *
         * Then abort the delete transaction.  That will revert the BIN entry
         * to the node we failed to migrate.  If we then try to fetch key 0,
         * we'll get LogFileNotFoundException.
         */
        env.checkpoint(forceConfig);
        txn.abort();
        status = db.get(null, key, data, null);
        assertEquals(OperationStatus.SUCCESS, status);

        /* If we get this far without LogFileNotFoundException, it's fixed. */
        closeEnv();
    }
}
|