000001 /* 000002 ** 2004 April 6 000003 ** 000004 ** The author disclaims copyright to this source code. In place of 000005 ** a legal notice, here is a blessing: 000006 ** 000007 ** May you do good and not evil. 000008 ** May you find forgiveness for yourself and forgive others. 000009 ** May you share freely, never taking more than you give. 000010 ** 000011 ************************************************************************* 000012 ** This file implements an external (disk-based) database using BTrees. 000013 ** See the header comment on "btreeInt.h" for additional information. 000014 ** Including a description of file format and an overview of operation. 000015 */ 000016 #include "btreeInt.h" 000017 000018 /* 000019 ** The header string that appears at the beginning of every 000020 ** SQLite database. 000021 */ 000022 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 000023 000024 /* 000025 ** Set this global variable to 1 to enable tracing using the TRACE 000026 ** macro. 000027 */ 000028 #if 0 000029 int sqlite3BtreeTrace=1; /* True to enable tracing */ 000030 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} 000031 #else 000032 # define TRACE(X) 000033 #endif 000034 000035 /* 000036 ** Extract a 2-byte big-endian integer from an array of unsigned bytes. 000037 ** But if the value is zero, make it 65536. 000038 ** 000039 ** This routine is used to extract the "offset to cell content area" value 000040 ** from the header of a btree page. If the page size is 65536 and the page 000041 ** is empty, the offset should be 65536, but the 2-byte value stores zero. 000042 ** This routine makes the necessary adjustment to 65536. 
000043 */ 000044 #define get2byteNotZero(X) (((((int)get2byte(X))-1)&0xffff)+1) 000045 000046 /* 000047 ** Values passed as the 5th argument to allocateBtreePage() 000048 */ 000049 #define BTALLOC_ANY 0 /* Allocate any page */ 000050 #define BTALLOC_EXACT 1 /* Allocate exact page if possible */ 000051 #define BTALLOC_LE 2 /* Allocate any page <= the parameter */ 000052 000053 /* 000054 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not 000055 ** defined, or 0 if it is. For example: 000056 ** 000057 ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum); 000058 */ 000059 #ifndef SQLITE_OMIT_AUTOVACUUM 000060 #define IfNotOmitAV(expr) (expr) 000061 #else 000062 #define IfNotOmitAV(expr) 0 000063 #endif 000064 000065 #ifndef SQLITE_OMIT_SHARED_CACHE 000066 /* 000067 ** A list of BtShared objects that are eligible for participation 000068 ** in shared cache. This variable has file scope during normal builds, 000069 ** but the test harness needs to access it so we make it global for 000070 ** test builds. 000071 ** 000072 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER. 000073 */ 000074 #ifdef SQLITE_TEST 000075 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 000076 #else 000077 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 000078 #endif 000079 #endif /* SQLITE_OMIT_SHARED_CACHE */ 000080 000081 #ifndef SQLITE_OMIT_SHARED_CACHE 000082 /* 000083 ** Enable or disable the shared pager and schema features. 000084 ** 000085 ** This routine has no effect on existing database connections. 000086 ** The shared cache setting effects only future calls to 000087 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 
000088 */ 000089 int sqlite3_enable_shared_cache(int enable){ 000090 sqlite3GlobalConfig.sharedCacheEnabled = enable; 000091 return SQLITE_OK; 000092 } 000093 #endif 000094 000095 000096 000097 #ifdef SQLITE_OMIT_SHARED_CACHE 000098 /* 000099 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), 000100 ** and clearAllSharedCacheTableLocks() 000101 ** manipulate entries in the BtShared.pLock linked list used to store 000102 ** shared-cache table level locks. If the library is compiled with the 000103 ** shared-cache feature disabled, then there is only ever one user 000104 ** of each BtShared structure and so this locking is not necessary. 000105 ** So define the lock related functions as no-ops. 000106 */ 000107 #define querySharedCacheTableLock(a,b,c) SQLITE_OK 000108 #define setSharedCacheTableLock(a,b,c) SQLITE_OK 000109 #define clearAllSharedCacheTableLocks(a) 000110 #define downgradeAllSharedCacheTableLocks(a) 000111 #define hasSharedCacheTableLock(a,b,c,d) 1 000112 #define hasReadConflicts(a, b) 0 000113 #endif 000114 000115 /* 000116 ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single 000117 ** (MemPage*) as an argument. The (MemPage*) must not be NULL. 000118 ** 000119 ** If SQLITE_DEBUG is not defined, then this macro is equivalent to 000120 ** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message 000121 ** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented 000122 ** with the page number and filename associated with the (MemPage*). 
000123 */ 000124 #ifdef SQLITE_DEBUG 000125 int corruptPageError(int lineno, MemPage *p){ 000126 char *zMsg; 000127 sqlite3BeginBenignMalloc(); 000128 zMsg = sqlite3_mprintf("database corruption page %d of %s", 000129 (int)p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0) 000130 ); 000131 sqlite3EndBenignMalloc(); 000132 if( zMsg ){ 000133 sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg); 000134 } 000135 sqlite3_free(zMsg); 000136 return SQLITE_CORRUPT_BKPT; 000137 } 000138 # define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, pMemPage) 000139 #else 000140 # define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO(pMemPage->pgno) 000141 #endif 000142 000143 #ifndef SQLITE_OMIT_SHARED_CACHE 000144 000145 #ifdef SQLITE_DEBUG 000146 /* 000147 **** This function is only used as part of an assert() statement. *** 000148 ** 000149 ** Check to see if pBtree holds the required locks to read or write to the 000150 ** table with root page iRoot. Return 1 if it does and 0 if not. 000151 ** 000152 ** For example, when writing to a table with root-page iRoot via 000153 ** Btree connection pBtree: 000154 ** 000155 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); 000156 ** 000157 ** When writing to an index that resides in a sharable database, the 000158 ** caller should have first obtained a lock specifying the root page of 000159 ** the corresponding table. This makes things a bit more complicated, 000160 ** as this module treats each table as a separate structure. To determine 000161 ** the table corresponding to the index being written, this 000162 ** function has to search through the database schema. 000163 ** 000164 ** Instead of a lock on the table/index rooted at page iRoot, the caller may 000165 ** hold a write-lock on the schema table (root page 1). This is also 000166 ** acceptable. 
000167 */ 000168 static int hasSharedCacheTableLock( 000169 Btree *pBtree, /* Handle that must hold lock */ 000170 Pgno iRoot, /* Root page of b-tree */ 000171 int isIndex, /* True if iRoot is the root of an index b-tree */ 000172 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */ 000173 ){ 000174 Schema *pSchema = (Schema *)pBtree->pBt->pSchema; 000175 Pgno iTab = 0; 000176 BtLock *pLock; 000177 000178 /* If this database is not shareable, or if the client is reading 000179 ** and has the read-uncommitted flag set, then no lock is required. 000180 ** Return true immediately. 000181 */ 000182 if( (pBtree->sharable==0) 000183 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit)) 000184 ){ 000185 return 1; 000186 } 000187 000188 /* If the client is reading or writing an index and the schema is 000189 ** not loaded, then it is too difficult to actually check to see if 000190 ** the correct locks are held. So do not bother - just return true. 000191 ** This case does not come up very often anyhow. 000192 */ 000193 if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){ 000194 return 1; 000195 } 000196 000197 /* Figure out the root-page that the lock should be held on. For table 000198 ** b-trees, this is just the root page of the b-tree being read or 000199 ** written. For index b-trees, it is the root page of the associated 000200 ** table. */ 000201 if( isIndex ){ 000202 HashElem *p; 000203 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ 000204 Index *pIdx = (Index *)sqliteHashData(p); 000205 if( pIdx->tnum==(int)iRoot ){ 000206 if( iTab ){ 000207 /* Two or more indexes share the same root page. There must 000208 ** be imposter tables. So just return true. The assert is not 000209 ** useful in that case. */ 000210 return 1; 000211 } 000212 iTab = pIdx->pTable->tnum; 000213 } 000214 } 000215 }else{ 000216 iTab = iRoot; 000217 } 000218 000219 /* Search for the required lock. 
Either a write-lock on root-page iTab, a 000220 ** write-lock on the schema table, or (if the client is reading) a 000221 ** read-lock on iTab will suffice. Return 1 if any of these are found. */ 000222 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ 000223 if( pLock->pBtree==pBtree 000224 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) 000225 && pLock->eLock>=eLockType 000226 ){ 000227 return 1; 000228 } 000229 } 000230 000231 /* Failed to find the required lock. */ 000232 return 0; 000233 } 000234 #endif /* SQLITE_DEBUG */ 000235 000236 #ifdef SQLITE_DEBUG 000237 /* 000238 **** This function may be used as part of assert() statements only. **** 000239 ** 000240 ** Return true if it would be illegal for pBtree to write into the 000241 ** table or index rooted at iRoot because other shared connections are 000242 ** simultaneously reading that same table or index. 000243 ** 000244 ** It is illegal for pBtree to write if some other Btree object that 000245 ** shares the same BtShared object is currently reading or writing 000246 ** the iRoot table. Except, if the other Btree object has the 000247 ** read-uncommitted flag set, then it is OK for the other object to 000248 ** have a read cursor. 000249 ** 000250 ** For example, before writing to any part of the table or index 000251 ** rooted at page iRoot, one should call: 000252 ** 000253 ** assert( !hasReadConflicts(pBtree, iRoot) ); 000254 */ 000255 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ 000256 BtCursor *p; 000257 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 000258 if( p->pgnoRoot==iRoot 000259 && p->pBtree!=pBtree 000260 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit) 000261 ){ 000262 return 1; 000263 } 000264 } 000265 return 0; 000266 } 000267 #endif /* #ifdef SQLITE_DEBUG */ 000268 000269 /* 000270 ** Query to see if Btree handle p may obtain a lock of type eLock 000271 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. 
Return 000272 ** SQLITE_OK if the lock may be obtained (by calling 000273 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not. 000274 */ 000275 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ 000276 BtShared *pBt = p->pBt; 000277 BtLock *pIter; 000278 000279 assert( sqlite3BtreeHoldsMutex(p) ); 000280 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 000281 assert( p->db!=0 ); 000282 assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 ); 000283 000284 /* If requesting a write-lock, then the Btree must have an open write 000285 ** transaction on this file. And, obviously, for this to be so there 000286 ** must be an open write transaction on the file itself. 000287 */ 000288 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); 000289 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); 000290 000291 /* This routine is a no-op if the shared-cache is not enabled */ 000292 if( !p->sharable ){ 000293 return SQLITE_OK; 000294 } 000295 000296 /* If some other connection is holding an exclusive lock, the 000297 ** requested lock may not be obtained. 000298 */ 000299 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){ 000300 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); 000301 return SQLITE_LOCKED_SHAREDCACHE; 000302 } 000303 000304 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 000305 /* The condition (pIter->eLock!=eLock) in the following if(...) 000306 ** statement is a simplification of: 000307 ** 000308 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) 000309 ** 000310 ** since we know that if eLock==WRITE_LOCK, then no other connection 000311 ** may hold a WRITE_LOCK on any table in this file (since there can 000312 ** only be a single writer). 
000313 */ 000314 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); 000315 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); 000316 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ 000317 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); 000318 if( eLock==WRITE_LOCK ){ 000319 assert( p==pBt->pWriter ); 000320 pBt->btsFlags |= BTS_PENDING; 000321 } 000322 return SQLITE_LOCKED_SHAREDCACHE; 000323 } 000324 } 000325 return SQLITE_OK; 000326 } 000327 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000328 000329 #ifndef SQLITE_OMIT_SHARED_CACHE 000330 /* 000331 ** Add a lock on the table with root-page iTable to the shared-btree used 000332 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 000333 ** WRITE_LOCK. 000334 ** 000335 ** This function assumes the following: 000336 ** 000337 ** (a) The specified Btree object p is connected to a sharable 000338 ** database (one with the BtShared.sharable flag set), and 000339 ** 000340 ** (b) No other Btree objects hold a lock that conflicts 000341 ** with the requested lock (i.e. querySharedCacheTableLock() has 000342 ** already been called and returned SQLITE_OK). 000343 ** 000344 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 000345 ** is returned if a malloc attempt fails. 000346 */ 000347 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ 000348 BtShared *pBt = p->pBt; 000349 BtLock *pLock = 0; 000350 BtLock *pIter; 000351 000352 assert( sqlite3BtreeHoldsMutex(p) ); 000353 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 000354 assert( p->db!=0 ); 000355 000356 /* A connection with the read-uncommitted flag set will never try to 000357 ** obtain a read-lock using this function. The only read-lock obtained 000358 ** by a connection in read-uncommitted mode is on the sqlite_master 000359 ** table, and that lock is obtained in BtreeBeginTrans(). 
*/ 000360 assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK ); 000361 000362 /* This function should only be called on a sharable b-tree after it 000363 ** has been determined that no other b-tree holds a conflicting lock. */ 000364 assert( p->sharable ); 000365 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); 000366 000367 /* First search the list for an existing lock on this table. */ 000368 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 000369 if( pIter->iTable==iTable && pIter->pBtree==p ){ 000370 pLock = pIter; 000371 break; 000372 } 000373 } 000374 000375 /* If the above search did not find a BtLock struct associating Btree p 000376 ** with table iTable, allocate one and link it into the list. 000377 */ 000378 if( !pLock ){ 000379 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 000380 if( !pLock ){ 000381 return SQLITE_NOMEM_BKPT; 000382 } 000383 pLock->iTable = iTable; 000384 pLock->pBtree = p; 000385 pLock->pNext = pBt->pLock; 000386 pBt->pLock = pLock; 000387 } 000388 000389 /* Set the BtLock.eLock variable to the maximum of the current lock 000390 ** and the requested lock. This means if a write-lock was already held 000391 ** and a read-lock requested, we don't incorrectly downgrade the lock. 000392 */ 000393 assert( WRITE_LOCK>READ_LOCK ); 000394 if( eLock>pLock->eLock ){ 000395 pLock->eLock = eLock; 000396 } 000397 000398 return SQLITE_OK; 000399 } 000400 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000401 000402 #ifndef SQLITE_OMIT_SHARED_CACHE 000403 /* 000404 ** Release all the table locks (locks obtained via calls to 000405 ** the setSharedCacheTableLock() procedure) held by Btree object p. 000406 ** 000407 ** This function assumes that Btree p has an open read or write 000408 ** transaction. If it does not, then the BTS_PENDING flag 000409 ** may be incorrectly cleared. 
000410 */ 000411 static void clearAllSharedCacheTableLocks(Btree *p){ 000412 BtShared *pBt = p->pBt; 000413 BtLock **ppIter = &pBt->pLock; 000414 000415 assert( sqlite3BtreeHoldsMutex(p) ); 000416 assert( p->sharable || 0==*ppIter ); 000417 assert( p->inTrans>0 ); 000418 000419 while( *ppIter ){ 000420 BtLock *pLock = *ppIter; 000421 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree ); 000422 assert( pLock->pBtree->inTrans>=pLock->eLock ); 000423 if( pLock->pBtree==p ){ 000424 *ppIter = pLock->pNext; 000425 assert( pLock->iTable!=1 || pLock==&p->lock ); 000426 if( pLock->iTable!=1 ){ 000427 sqlite3_free(pLock); 000428 } 000429 }else{ 000430 ppIter = &pLock->pNext; 000431 } 000432 } 000433 000434 assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter ); 000435 if( pBt->pWriter==p ){ 000436 pBt->pWriter = 0; 000437 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); 000438 }else if( pBt->nTransaction==2 ){ 000439 /* This function is called when Btree p is concluding its 000440 ** transaction. If there currently exists a writer, and p is not 000441 ** that writer, then the number of locks held by connections other 000442 ** than the writer must be about to drop to zero. In this case 000443 ** set the BTS_PENDING flag to 0. 000444 ** 000445 ** If there is not currently a writer, then BTS_PENDING must 000446 ** be zero already. So this next line is harmless in that case. 000447 */ 000448 pBt->btsFlags &= ~BTS_PENDING; 000449 } 000450 } 000451 000452 /* 000453 ** This function changes all write-locks held by Btree p into read-locks. 
000454 */ 000455 static void downgradeAllSharedCacheTableLocks(Btree *p){ 000456 BtShared *pBt = p->pBt; 000457 if( pBt->pWriter==p ){ 000458 BtLock *pLock; 000459 pBt->pWriter = 0; 000460 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); 000461 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ 000462 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p ); 000463 pLock->eLock = READ_LOCK; 000464 } 000465 } 000466 } 000467 000468 #endif /* SQLITE_OMIT_SHARED_CACHE */ 000469 000470 static void releasePage(MemPage *pPage); /* Forward reference */ 000471 static void releasePageOne(MemPage *pPage); /* Forward reference */ 000472 static void releasePageNotNull(MemPage *pPage); /* Forward reference */ 000473 000474 /* 000475 ***** This routine is used inside of assert() only **** 000476 ** 000477 ** Verify that the cursor holds the mutex on its BtShared 000478 */ 000479 #ifdef SQLITE_DEBUG 000480 static int cursorHoldsMutex(BtCursor *p){ 000481 return sqlite3_mutex_held(p->pBt->mutex); 000482 } 000483 000484 /* Verify that the cursor and the BtShared agree about what is the current 000485 ** database connetion. This is important in shared-cache mode. If the database 000486 ** connection pointers get out-of-sync, it is possible for routines like 000487 ** btreeInitPage() to reference an stale connection pointer that references a 000488 ** a connection that has already closed. This routine is used inside assert() 000489 ** statements only and for the purpose of double-checking that the btree code 000490 ** does keep the database connection pointers up-to-date. 000491 */ 000492 static int cursorOwnsBtShared(BtCursor *p){ 000493 assert( cursorHoldsMutex(p) ); 000494 return (p->pBtree->db==p->pBt->db); 000495 } 000496 #endif 000497 000498 /* 000499 ** Invalidate the overflow cache of the cursor passed as the first argument. 000500 ** on the shared btree structure pBt. 
000501 */ 000502 #define invalidateOverflowCache(pCur) (pCur->curFlags &= ~BTCF_ValidOvfl) 000503 000504 /* 000505 ** Invalidate the overflow page-list cache for all cursors opened 000506 ** on the shared btree structure pBt. 000507 */ 000508 static void invalidateAllOverflowCache(BtShared *pBt){ 000509 BtCursor *p; 000510 assert( sqlite3_mutex_held(pBt->mutex) ); 000511 for(p=pBt->pCursor; p; p=p->pNext){ 000512 invalidateOverflowCache(p); 000513 } 000514 } 000515 000516 #ifndef SQLITE_OMIT_INCRBLOB 000517 /* 000518 ** This function is called before modifying the contents of a table 000519 ** to invalidate any incrblob cursors that are open on the 000520 ** row or one of the rows being modified. 000521 ** 000522 ** If argument isClearTable is true, then the entire contents of the 000523 ** table is about to be deleted. In this case invalidate all incrblob 000524 ** cursors open on any row within the table with root-page pgnoRoot. 000525 ** 000526 ** Otherwise, if argument isClearTable is false, then the row with 000527 ** rowid iRow is being replaced or deleted. In this case invalidate 000528 ** only those incrblob cursors open on that specific row. 
000529 */ 000530 static void invalidateIncrblobCursors( 000531 Btree *pBtree, /* The database file to check */ 000532 Pgno pgnoRoot, /* The table that might be changing */ 000533 i64 iRow, /* The rowid that might be changing */ 000534 int isClearTable /* True if all rows are being deleted */ 000535 ){ 000536 BtCursor *p; 000537 if( pBtree->hasIncrblobCur==0 ) return; 000538 assert( sqlite3BtreeHoldsMutex(pBtree) ); 000539 pBtree->hasIncrblobCur = 0; 000540 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 000541 if( (p->curFlags & BTCF_Incrblob)!=0 ){ 000542 pBtree->hasIncrblobCur = 1; 000543 if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){ 000544 p->eState = CURSOR_INVALID; 000545 } 000546 } 000547 } 000548 } 000549 000550 #else 000551 /* Stub function when INCRBLOB is omitted */ 000552 #define invalidateIncrblobCursors(w,x,y,z) 000553 #endif /* SQLITE_OMIT_INCRBLOB */ 000554 000555 /* 000556 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 000557 ** when a page that previously contained data becomes a free-list leaf 000558 ** page. 000559 ** 000560 ** The BtShared.pHasContent bitvec exists to work around an obscure 000561 ** bug caused by the interaction of two useful IO optimizations surrounding 000562 ** free-list leaf pages: 000563 ** 000564 ** 1) When all data is deleted from a page and the page becomes 000565 ** a free-list leaf page, the page is not written to the database 000566 ** (as free-list leaf pages contain no meaningful data). Sometimes 000567 ** such a page is not even journalled (as it will not be modified, 000568 ** why bother journalling it?). 000569 ** 000570 ** 2) When a free-list leaf page is reused, its content is not read 000571 ** from the database or written to the journal file (why should it 000572 ** be, if it is not at all meaningful?). 000573 ** 000574 ** By themselves, these optimizations work fine and provide a handy 000575 ** performance boost to bulk delete or insert operations. 
However, if 000576 ** a page is moved to the free-list and then reused within the same 000577 ** transaction, a problem comes up. If the page is not journalled when 000578 ** it is moved to the free-list and it is also not journalled when it 000579 ** is extracted from the free-list and reused, then the original data 000580 ** may be lost. In the event of a rollback, it may not be possible 000581 ** to restore the database to its original configuration. 000582 ** 000583 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 000584 ** moved to become a free-list leaf page, the corresponding bit is 000585 ** set in the bitvec. Whenever a leaf page is extracted from the free-list, 000586 ** optimization 2 above is omitted if the corresponding bit is already 000587 ** set in BtShared.pHasContent. The contents of the bitvec are cleared 000588 ** at the end of every transaction. 000589 */ 000590 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ 000591 int rc = SQLITE_OK; 000592 if( !pBt->pHasContent ){ 000593 assert( pgno<=pBt->nPage ); 000594 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage); 000595 if( !pBt->pHasContent ){ 000596 rc = SQLITE_NOMEM_BKPT; 000597 } 000598 } 000599 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ 000600 rc = sqlite3BitvecSet(pBt->pHasContent, pgno); 000601 } 000602 return rc; 000603 } 000604 000605 /* 000606 ** Query the BtShared.pHasContent vector. 000607 ** 000608 ** This function is called when a free-list leaf page is removed from the 000609 ** free-list for reuse. It returns false if it is safe to retrieve the 000610 ** page from the pager layer with the 'no-content' flag set. True otherwise. 000611 */ 000612 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ 000613 Bitvec *p = pBt->pHasContent; 000614 return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno))); 000615 } 000616 000617 /* 000618 ** Clear (destroy) the BtShared.pHasContent bitvec. 
This should be 000619 ** invoked at the conclusion of each write-transaction. 000620 */ 000621 static void btreeClearHasContent(BtShared *pBt){ 000622 sqlite3BitvecDestroy(pBt->pHasContent); 000623 pBt->pHasContent = 0; 000624 } 000625 000626 /* 000627 ** Release all of the apPage[] pages for a cursor. 000628 */ 000629 static void btreeReleaseAllCursorPages(BtCursor *pCur){ 000630 int i; 000631 if( pCur->iPage>=0 ){ 000632 for(i=0; i<pCur->iPage; i++){ 000633 releasePageNotNull(pCur->apPage[i]); 000634 } 000635 releasePageNotNull(pCur->pPage); 000636 pCur->iPage = -1; 000637 } 000638 } 000639 000640 /* 000641 ** The cursor passed as the only argument must point to a valid entry 000642 ** when this function is called (i.e. have eState==CURSOR_VALID). This 000643 ** function saves the current cursor key in variables pCur->nKey and 000644 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error 000645 ** code otherwise. 000646 ** 000647 ** If the cursor is open on an intkey table, then the integer key 000648 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to 000649 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is 000650 ** set to point to a malloced buffer pCur->nKey bytes in size containing 000651 ** the key. 000652 */ 000653 static int saveCursorKey(BtCursor *pCur){ 000654 int rc = SQLITE_OK; 000655 assert( CURSOR_VALID==pCur->eState ); 000656 assert( 0==pCur->pKey ); 000657 assert( cursorHoldsMutex(pCur) ); 000658 000659 if( pCur->curIntKey ){ 000660 /* Only the rowid is required for a table btree */ 000661 pCur->nKey = sqlite3BtreeIntegerKey(pCur); 000662 }else{ 000663 /* For an index btree, save the complete key content. It is possible 000664 ** that the current key is corrupt. In that case, it is possible that 000665 ** the sqlite3VdbeRecordUnpack() function may overread the buffer by 000666 ** up to the size of 1 varint plus 1 8-byte value when the cursor 000667 ** position is restored. 
Hence the 17 bytes of padding allocated 000668 ** below. */ 000669 void *pKey; 000670 pCur->nKey = sqlite3BtreePayloadSize(pCur); 000671 pKey = sqlite3Malloc( pCur->nKey + 9 + 8 ); 000672 if( pKey ){ 000673 rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey); 000674 if( rc==SQLITE_OK ){ 000675 memset(((u8*)pKey)+pCur->nKey, 0, 9+8); 000676 pCur->pKey = pKey; 000677 }else{ 000678 sqlite3_free(pKey); 000679 } 000680 }else{ 000681 rc = SQLITE_NOMEM_BKPT; 000682 } 000683 } 000684 assert( !pCur->curIntKey || !pCur->pKey ); 000685 return rc; 000686 } 000687 000688 /* 000689 ** Save the current cursor position in the variables BtCursor.nKey 000690 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 000691 ** 000692 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID) 000693 ** prior to calling this routine. 000694 */ 000695 static int saveCursorPosition(BtCursor *pCur){ 000696 int rc; 000697 000698 assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState ); 000699 assert( 0==pCur->pKey ); 000700 assert( cursorHoldsMutex(pCur) ); 000701 000702 if( pCur->eState==CURSOR_SKIPNEXT ){ 000703 pCur->eState = CURSOR_VALID; 000704 }else{ 000705 pCur->skipNext = 0; 000706 } 000707 000708 rc = saveCursorKey(pCur); 000709 if( rc==SQLITE_OK ){ 000710 btreeReleaseAllCursorPages(pCur); 000711 pCur->eState = CURSOR_REQUIRESEEK; 000712 } 000713 000714 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast); 000715 return rc; 000716 } 000717 000718 /* Forward reference */ 000719 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*); 000720 000721 /* 000722 ** Save the positions of all cursors (except pExcept) that are open on 000723 ** the table with root-page iRoot. "Saving the cursor position" means that 000724 ** the location in the btree is remembered in such a way that it can be 000725 ** moved back to the same spot after the btree has been modified. 
This 000726 ** routine is called just before cursor pExcept is used to modify the 000727 ** table, for example in BtreeDelete() or BtreeInsert(). 000728 ** 000729 ** If there are two or more cursors on the same btree, then all such 000730 ** cursors should have their BTCF_Multiple flag set. The btreeCursor() 000731 ** routine enforces that rule. This routine only needs to be called in 000732 ** the uncommon case when pExpect has the BTCF_Multiple flag set. 000733 ** 000734 ** If pExpect!=NULL and if no other cursors are found on the same root-page, 000735 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another 000736 ** pointless call to this routine. 000737 ** 000738 ** Implementation note: This routine merely checks to see if any cursors 000739 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual) 000740 ** event that cursors are in need to being saved. 000741 */ 000742 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 000743 BtCursor *p; 000744 assert( sqlite3_mutex_held(pBt->mutex) ); 000745 assert( pExcept==0 || pExcept->pBt==pBt ); 000746 for(p=pBt->pCursor; p; p=p->pNext){ 000747 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break; 000748 } 000749 if( p ) return saveCursorsOnList(p, iRoot, pExcept); 000750 if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple; 000751 return SQLITE_OK; 000752 } 000753 000754 /* This helper routine to saveAllCursors does the actual work of saving 000755 ** the cursors if and when a cursor is found that actually requires saving. 000756 ** The common case is that no cursors need to be saved, so this routine is 000757 ** broken out from its caller to avoid unnecessary stack pointer movement. 000758 */ 000759 static int SQLITE_NOINLINE saveCursorsOnList( 000760 BtCursor *p, /* The first cursor that needs saving */ 000761 Pgno iRoot, /* Only save cursor with this iRoot. 
Save all if zero */ 000762 BtCursor *pExcept /* Do not save this cursor */ 000763 ){ 000764 do{ 000765 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){ 000766 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){ 000767 int rc = saveCursorPosition(p); 000768 if( SQLITE_OK!=rc ){ 000769 return rc; 000770 } 000771 }else{ 000772 testcase( p->iPage>=0 ); 000773 btreeReleaseAllCursorPages(p); 000774 } 000775 } 000776 p = p->pNext; 000777 }while( p ); 000778 return SQLITE_OK; 000779 } 000780 000781 /* 000782 ** Clear the current cursor position. 000783 */ 000784 void sqlite3BtreeClearCursor(BtCursor *pCur){ 000785 assert( cursorHoldsMutex(pCur) ); 000786 sqlite3_free(pCur->pKey); 000787 pCur->pKey = 0; 000788 pCur->eState = CURSOR_INVALID; 000789 } 000790 000791 /* 000792 ** In this version of BtreeMoveto, pKey is a packed index record 000793 ** such as is generated by the OP_MakeRecord opcode. Unpack the 000794 ** record and then call BtreeMovetoUnpacked() to do the work. 000795 */ 000796 static int btreeMoveto( 000797 BtCursor *pCur, /* Cursor open on the btree to be searched */ 000798 const void *pKey, /* Packed key if the btree is an index */ 000799 i64 nKey, /* Integer key for tables. 
Size of pKey for indices */ 000800 int bias, /* Bias search to the high end */ 000801 int *pRes /* Write search results here */ 000802 ){ 000803 int rc; /* Status code */ 000804 UnpackedRecord *pIdxKey; /* Unpacked index key */ 000805 000806 if( pKey ){ 000807 KeyInfo *pKeyInfo = pCur->pKeyInfo; 000808 assert( nKey==(i64)(int)nKey ); 000809 pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo); 000810 if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT; 000811 sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey); 000812 if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){ 000813 rc = SQLITE_CORRUPT_BKPT; 000814 goto moveto_done; 000815 } 000816 }else{ 000817 pIdxKey = 0; 000818 } 000819 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes); 000820 moveto_done: 000821 if( pIdxKey ){ 000822 sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey); 000823 } 000824 return rc; 000825 } 000826 000827 /* 000828 ** Restore the cursor to the position it was in (or as close to as possible) 000829 ** when saveCursorPosition() was called. Note that this call deletes the 000830 ** saved position info stored by saveCursorPosition(), so there can be 000831 ** at most one effective restoreCursorPosition() call after each 000832 ** saveCursorPosition(). 
000833 */ 000834 static int btreeRestoreCursorPosition(BtCursor *pCur){ 000835 int rc; 000836 int skipNext = 0; 000837 assert( cursorOwnsBtShared(pCur) ); 000838 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 000839 if( pCur->eState==CURSOR_FAULT ){ 000840 return pCur->skipNext; 000841 } 000842 pCur->eState = CURSOR_INVALID; 000843 if( sqlite3FaultSim(410) ){ 000844 rc = SQLITE_IOERR; 000845 }else{ 000846 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext); 000847 } 000848 if( rc==SQLITE_OK ){ 000849 sqlite3_free(pCur->pKey); 000850 pCur->pKey = 0; 000851 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 000852 if( skipNext ) pCur->skipNext = skipNext; 000853 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){ 000854 pCur->eState = CURSOR_SKIPNEXT; 000855 } 000856 } 000857 return rc; 000858 } 000859 000860 #define restoreCursorPosition(p) \ 000861 (p->eState>=CURSOR_REQUIRESEEK ? \ 000862 btreeRestoreCursorPosition(p) : \ 000863 SQLITE_OK) 000864 000865 /* 000866 ** Determine whether or not a cursor has moved from the position where 000867 ** it was last placed, or has been invalidated for any other reason. 000868 ** Cursors can move when the row they are pointing at is deleted out 000869 ** from under them, for example. Cursor might also move if a btree 000870 ** is rebalanced. 000871 ** 000872 ** Calling this routine with a NULL cursor pointer returns false. 000873 ** 000874 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor 000875 ** back to where it ought to be if this routine returns true. 
000876 */ 000877 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){ 000878 assert( EIGHT_BYTE_ALIGNMENT(pCur) 000879 || pCur==sqlite3BtreeFakeValidCursor() ); 000880 assert( offsetof(BtCursor, eState)==0 ); 000881 assert( sizeof(pCur->eState)==1 ); 000882 return CURSOR_VALID != *(u8*)pCur; 000883 } 000884 000885 /* 000886 ** Return a pointer to a fake BtCursor object that will always answer 000887 ** false to the sqlite3BtreeCursorHasMoved() routine above. The fake 000888 ** cursor returned must not be used with any other Btree interface. 000889 */ 000890 BtCursor *sqlite3BtreeFakeValidCursor(void){ 000891 static u8 fakeCursor = CURSOR_VALID; 000892 assert( offsetof(BtCursor, eState)==0 ); 000893 return (BtCursor*)&fakeCursor; 000894 } 000895 000896 /* 000897 ** This routine restores a cursor back to its original position after it 000898 ** has been moved by some outside activity (such as a btree rebalance or 000899 ** a row having been deleted out from under the cursor). 000900 ** 000901 ** On success, the *pDifferentRow parameter is false if the cursor is left 000902 ** pointing at exactly the same row. *pDifferntRow is the row the cursor 000903 ** was pointing to has been deleted, forcing the cursor to point to some 000904 ** nearby row. 000905 ** 000906 ** This routine should only be called for a cursor that just returned 000907 ** TRUE from sqlite3BtreeCursorHasMoved(). 000908 */ 000909 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){ 000910 int rc; 000911 000912 assert( pCur!=0 ); 000913 assert( pCur->eState!=CURSOR_VALID ); 000914 rc = restoreCursorPosition(pCur); 000915 if( rc ){ 000916 *pDifferentRow = 1; 000917 return rc; 000918 } 000919 if( pCur->eState!=CURSOR_VALID ){ 000920 *pDifferentRow = 1; 000921 }else{ 000922 *pDifferentRow = 0; 000923 } 000924 return SQLITE_OK; 000925 } 000926 000927 #ifdef SQLITE_ENABLE_CURSOR_HINTS 000928 /* 000929 ** Provide hints to the cursor. 
The particular hint given (and the type 000930 ** and number of the varargs parameters) is determined by the eHintType 000931 ** parameter. See the definitions of the BTREE_HINT_* macros for details. 000932 */ 000933 void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){ 000934 /* Used only by system that substitute their own storage engine */ 000935 } 000936 #endif 000937 000938 /* 000939 ** Provide flag hints to the cursor. 000940 */ 000941 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){ 000942 assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 ); 000943 pCur->hints = x; 000944 } 000945 000946 000947 #ifndef SQLITE_OMIT_AUTOVACUUM 000948 /* 000949 ** Given a page number of a regular database page, return the page 000950 ** number for the pointer-map page that contains the entry for the 000951 ** input page number. 000952 ** 000953 ** Return 0 (not a valid page) for pgno==1 since there is 000954 ** no pointer map associated with page 1. The integrity_check logic 000955 ** requires that ptrmapPageno(*,1)!=1. 000956 */ 000957 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 000958 int nPagesPerMapPage; 000959 Pgno iPtrMap, ret; 000960 assert( sqlite3_mutex_held(pBt->mutex) ); 000961 if( pgno<2 ) return 0; 000962 nPagesPerMapPage = (pBt->usableSize/5)+1; 000963 iPtrMap = (pgno-2)/nPagesPerMapPage; 000964 ret = (iPtrMap*nPagesPerMapPage) + 2; 000965 if( ret==PENDING_BYTE_PAGE(pBt) ){ 000966 ret++; 000967 } 000968 return ret; 000969 } 000970 000971 /* 000972 ** Write an entry into the pointer map. 000973 ** 000974 ** This routine updates the pointer map entry for page number 'key' 000975 ** so that it maps to type 'eType' and parent page number 'pgno'. 000976 ** 000977 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is 000978 ** a no-op. If an error occurs, the appropriate error code is written 000979 ** into *pRC. 
000980 */ 000981 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ 000982 DbPage *pDbPage; /* The pointer map page */ 000983 u8 *pPtrmap; /* The pointer map data */ 000984 Pgno iPtrmap; /* The pointer map page number */ 000985 int offset; /* Offset in pointer map page */ 000986 int rc; /* Return code from subfunctions */ 000987 000988 if( *pRC ) return; 000989 000990 assert( sqlite3_mutex_held(pBt->mutex) ); 000991 /* The master-journal page number must never be used as a pointer map page */ 000992 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 000993 000994 assert( pBt->autoVacuum ); 000995 if( key==0 ){ 000996 *pRC = SQLITE_CORRUPT_BKPT; 000997 return; 000998 } 000999 iPtrmap = PTRMAP_PAGENO(pBt, key); 001000 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0); 001001 if( rc!=SQLITE_OK ){ 001002 *pRC = rc; 001003 return; 001004 } 001005 if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){ 001006 /* The first byte of the extra data is the MemPage.isInit byte. 001007 ** If that byte is set, it means this page is also being used 001008 ** as a btree page. */ 001009 *pRC = SQLITE_CORRUPT_BKPT; 001010 goto ptrmap_exit; 001011 } 001012 offset = PTRMAP_PTROFFSET(iPtrmap, key); 001013 if( offset<0 ){ 001014 *pRC = SQLITE_CORRUPT_BKPT; 001015 goto ptrmap_exit; 001016 } 001017 assert( offset <= (int)pBt->usableSize-5 ); 001018 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 001019 001020 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 001021 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); 001022 *pRC= rc = sqlite3PagerWrite(pDbPage); 001023 if( rc==SQLITE_OK ){ 001024 pPtrmap[offset] = eType; 001025 put4byte(&pPtrmap[offset+1], parent); 001026 } 001027 } 001028 001029 ptrmap_exit: 001030 sqlite3PagerUnref(pDbPage); 001031 } 001032 001033 /* 001034 ** Read an entry from the pointer map. 
001035 ** 001036 ** This routine retrieves the pointer map entry for page 'key', writing 001037 ** the type and parent page number to *pEType and *pPgno respectively. 001038 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 001039 */ 001040 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 001041 DbPage *pDbPage; /* The pointer map page */ 001042 int iPtrmap; /* Pointer map page index */ 001043 u8 *pPtrmap; /* Pointer map page data */ 001044 int offset; /* Offset of entry in pointer map */ 001045 int rc; 001046 001047 assert( sqlite3_mutex_held(pBt->mutex) ); 001048 001049 iPtrmap = PTRMAP_PAGENO(pBt, key); 001050 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0); 001051 if( rc!=0 ){ 001052 return rc; 001053 } 001054 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 001055 001056 offset = PTRMAP_PTROFFSET(iPtrmap, key); 001057 if( offset<0 ){ 001058 sqlite3PagerUnref(pDbPage); 001059 return SQLITE_CORRUPT_BKPT; 001060 } 001061 assert( offset <= (int)pBt->usableSize-5 ); 001062 assert( pEType!=0 ); 001063 *pEType = pPtrmap[offset]; 001064 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 001065 001066 sqlite3PagerUnref(pDbPage); 001067 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap); 001068 return SQLITE_OK; 001069 } 001070 001071 #else /* if defined SQLITE_OMIT_AUTOVACUUM */ 001072 #define ptrmapPut(w,x,y,z,rc) 001073 #define ptrmapGet(w,x,y,z) SQLITE_OK 001074 #define ptrmapPutOvflPtr(x, y, z, rc) 001075 #endif 001076 001077 /* 001078 ** Given a btree page and a cell index (0 means the first cell on 001079 ** the page, 1 means the second cell, and so forth) return a pointer 001080 ** to the cell content. 001081 ** 001082 ** findCellPastPtr() does the same except it skips past the initial 001083 ** 4-byte child pointer found on interior pages, if there is one. 001084 ** 001085 ** This routine works only for pages that do not contain overflow cells. 
001086 */ 001087 #define findCell(P,I) \ 001088 ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) 001089 #define findCellPastPtr(P,I) \ 001090 ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) 001091 001092 001093 /* 001094 ** This is common tail processing for btreeParseCellPtr() and 001095 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely 001096 ** on a single B-tree page. Make necessary adjustments to the CellInfo 001097 ** structure. 001098 */ 001099 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow( 001100 MemPage *pPage, /* Page containing the cell */ 001101 u8 *pCell, /* Pointer to the cell text. */ 001102 CellInfo *pInfo /* Fill in this structure */ 001103 ){ 001104 /* If the payload will not fit completely on the local page, we have 001105 ** to decide how much to store locally and how much to spill onto 001106 ** overflow pages. The strategy is to minimize the amount of unused 001107 ** space on overflow pages while keeping the amount of local storage 001108 ** in between minLocal and maxLocal. 001109 ** 001110 ** Warning: changing the way overflow payload is distributed in any 001111 ** way will result in an incompatible file format. 
001112 */ 001113 int minLocal; /* Minimum amount of payload held locally */ 001114 int maxLocal; /* Maximum amount of payload held locally */ 001115 int surplus; /* Overflow payload available for local storage */ 001116 001117 minLocal = pPage->minLocal; 001118 maxLocal = pPage->maxLocal; 001119 surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4); 001120 testcase( surplus==maxLocal ); 001121 testcase( surplus==maxLocal+1 ); 001122 if( surplus <= maxLocal ){ 001123 pInfo->nLocal = (u16)surplus; 001124 }else{ 001125 pInfo->nLocal = (u16)minLocal; 001126 } 001127 pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4; 001128 } 001129 001130 /* 001131 ** The following routines are implementations of the MemPage.xParseCell() 001132 ** method. 001133 ** 001134 ** Parse a cell content block and fill in the CellInfo structure. 001135 ** 001136 ** btreeParseCellPtr() => table btree leaf nodes 001137 ** btreeParseCellNoPayload() => table btree internal nodes 001138 ** btreeParseCellPtrIndex() => index btree nodes 001139 ** 001140 ** There is also a wrapper function btreeParseCell() that works for 001141 ** all MemPage types and that references the cell by index rather than 001142 ** by pointer. 001143 */ 001144 static void btreeParseCellPtrNoPayload( 001145 MemPage *pPage, /* Page containing the cell */ 001146 u8 *pCell, /* Pointer to the cell text. */ 001147 CellInfo *pInfo /* Fill in this structure */ 001148 ){ 001149 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001150 assert( pPage->leaf==0 ); 001151 assert( pPage->childPtrSize==4 ); 001152 #ifndef SQLITE_DEBUG 001153 UNUSED_PARAMETER(pPage); 001154 #endif 001155 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey); 001156 pInfo->nPayload = 0; 001157 pInfo->nLocal = 0; 001158 pInfo->pPayload = 0; 001159 return; 001160 } 001161 static void btreeParseCellPtr( 001162 MemPage *pPage, /* Page containing the cell */ 001163 u8 *pCell, /* Pointer to the cell text. 
*/ 001164 CellInfo *pInfo /* Fill in this structure */ 001165 ){ 001166 u8 *pIter; /* For scanning through pCell */ 001167 u32 nPayload; /* Number of bytes of cell payload */ 001168 u64 iKey; /* Extracted Key value */ 001169 001170 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001171 assert( pPage->leaf==0 || pPage->leaf==1 ); 001172 assert( pPage->intKeyLeaf ); 001173 assert( pPage->childPtrSize==0 ); 001174 pIter = pCell; 001175 001176 /* The next block of code is equivalent to: 001177 ** 001178 ** pIter += getVarint32(pIter, nPayload); 001179 ** 001180 ** The code is inlined to avoid a function call. 001181 */ 001182 nPayload = *pIter; 001183 if( nPayload>=0x80 ){ 001184 u8 *pEnd = &pIter[8]; 001185 nPayload &= 0x7f; 001186 do{ 001187 nPayload = (nPayload<<7) | (*++pIter & 0x7f); 001188 }while( (*pIter)>=0x80 && pIter<pEnd ); 001189 } 001190 pIter++; 001191 001192 /* The next block of code is equivalent to: 001193 ** 001194 ** pIter += getVarint(pIter, (u64*)&pInfo->nKey); 001195 ** 001196 ** The code is inlined to avoid a function call. 001197 */ 001198 iKey = *pIter; 001199 if( iKey>=0x80 ){ 001200 u8 *pEnd = &pIter[7]; 001201 iKey &= 0x7f; 001202 while(1){ 001203 iKey = (iKey<<7) | (*++pIter & 0x7f); 001204 if( (*pIter)<0x80 ) break; 001205 if( pIter>=pEnd ){ 001206 iKey = (iKey<<8) | *++pIter; 001207 break; 001208 } 001209 } 001210 } 001211 pIter++; 001212 001213 pInfo->nKey = *(i64*)&iKey; 001214 pInfo->nPayload = nPayload; 001215 pInfo->pPayload = pIter; 001216 testcase( nPayload==pPage->maxLocal ); 001217 testcase( nPayload==pPage->maxLocal+1 ); 001218 if( nPayload<=pPage->maxLocal ){ 001219 /* This is the (easy) common case where the entire payload fits 001220 ** on the local page. No overflow is required. 
001221 */ 001222 pInfo->nSize = nPayload + (u16)(pIter - pCell); 001223 if( pInfo->nSize<4 ) pInfo->nSize = 4; 001224 pInfo->nLocal = (u16)nPayload; 001225 }else{ 001226 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); 001227 } 001228 } 001229 static void btreeParseCellPtrIndex( 001230 MemPage *pPage, /* Page containing the cell */ 001231 u8 *pCell, /* Pointer to the cell text. */ 001232 CellInfo *pInfo /* Fill in this structure */ 001233 ){ 001234 u8 *pIter; /* For scanning through pCell */ 001235 u32 nPayload; /* Number of bytes of cell payload */ 001236 001237 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001238 assert( pPage->leaf==0 || pPage->leaf==1 ); 001239 assert( pPage->intKeyLeaf==0 ); 001240 pIter = pCell + pPage->childPtrSize; 001241 nPayload = *pIter; 001242 if( nPayload>=0x80 ){ 001243 u8 *pEnd = &pIter[8]; 001244 nPayload &= 0x7f; 001245 do{ 001246 nPayload = (nPayload<<7) | (*++pIter & 0x7f); 001247 }while( *(pIter)>=0x80 && pIter<pEnd ); 001248 } 001249 pIter++; 001250 pInfo->nKey = nPayload; 001251 pInfo->nPayload = nPayload; 001252 pInfo->pPayload = pIter; 001253 testcase( nPayload==pPage->maxLocal ); 001254 testcase( nPayload==pPage->maxLocal+1 ); 001255 if( nPayload<=pPage->maxLocal ){ 001256 /* This is the (easy) common case where the entire payload fits 001257 ** on the local page. No overflow is required. 001258 */ 001259 pInfo->nSize = nPayload + (u16)(pIter - pCell); 001260 if( pInfo->nSize<4 ) pInfo->nSize = 4; 001261 pInfo->nLocal = (u16)nPayload; 001262 }else{ 001263 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); 001264 } 001265 } 001266 static void btreeParseCell( 001267 MemPage *pPage, /* Page containing the cell */ 001268 int iCell, /* The cell index. 
First cell is 0 */ 001269 CellInfo *pInfo /* Fill in this structure */ 001270 ){ 001271 pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo); 001272 } 001273 001274 /* 001275 ** The following routines are implementations of the MemPage.xCellSize 001276 ** method. 001277 ** 001278 ** Compute the total number of bytes that a Cell needs in the cell 001279 ** data area of the btree-page. The return number includes the cell 001280 ** data header and the local payload, but not any overflow page or 001281 ** the space used by the cell pointer. 001282 ** 001283 ** cellSizePtrNoPayload() => table internal nodes 001284 ** cellSizePtr() => all index nodes & table leaf nodes 001285 */ 001286 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ 001287 u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */ 001288 u8 *pEnd; /* End mark for a varint */ 001289 u32 nSize; /* Size value to return */ 001290 001291 #ifdef SQLITE_DEBUG 001292 /* The value returned by this function should always be the same as 001293 ** the (CellInfo.nSize) value found by doing a full parse of the 001294 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001295 ** this function verifies that this invariant is not violated. */ 001296 CellInfo debuginfo; 001297 pPage->xParseCell(pPage, pCell, &debuginfo); 001298 #endif 001299 001300 nSize = *pIter; 001301 if( nSize>=0x80 ){ 001302 pEnd = &pIter[8]; 001303 nSize &= 0x7f; 001304 do{ 001305 nSize = (nSize<<7) | (*++pIter & 0x7f); 001306 }while( *(pIter)>=0x80 && pIter<pEnd ); 001307 } 001308 pIter++; 001309 if( pPage->intKey ){ 001310 /* pIter now points at the 64-bit integer key value, a variable length 001311 ** integer. The following block moves pIter to point at the first byte 001312 ** past the end of the key value. 
*/ 001313 pEnd = &pIter[9]; 001314 while( (*pIter++)&0x80 && pIter<pEnd ); 001315 } 001316 testcase( nSize==pPage->maxLocal ); 001317 testcase( nSize==pPage->maxLocal+1 ); 001318 if( nSize<=pPage->maxLocal ){ 001319 nSize += (u32)(pIter - pCell); 001320 if( nSize<4 ) nSize = 4; 001321 }else{ 001322 int minLocal = pPage->minLocal; 001323 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001324 testcase( nSize==pPage->maxLocal ); 001325 testcase( nSize==pPage->maxLocal+1 ); 001326 if( nSize>pPage->maxLocal ){ 001327 nSize = minLocal; 001328 } 001329 nSize += 4 + (u16)(pIter - pCell); 001330 } 001331 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001332 return (u16)nSize; 001333 } 001334 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){ 001335 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ 001336 u8 *pEnd; /* End mark for a varint */ 001337 001338 #ifdef SQLITE_DEBUG 001339 /* The value returned by this function should always be the same as 001340 ** the (CellInfo.nSize) value found by doing a full parse of the 001341 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001342 ** this function verifies that this invariant is not violated. */ 001343 CellInfo debuginfo; 001344 pPage->xParseCell(pPage, pCell, &debuginfo); 001345 #else 001346 UNUSED_PARAMETER(pPage); 001347 #endif 001348 001349 assert( pPage->childPtrSize==4 ); 001350 pEnd = pIter + 9; 001351 while( (*pIter++)&0x80 && pIter<pEnd ); 001352 assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB ); 001353 return (u16)(pIter - pCell); 001354 } 001355 001356 001357 #ifdef SQLITE_DEBUG 001358 /* This variation on cellSizePtr() is used inside of assert() statements 001359 ** only. 
*/ 001360 static u16 cellSize(MemPage *pPage, int iCell){ 001361 return pPage->xCellSize(pPage, findCell(pPage, iCell)); 001362 } 001363 #endif 001364 001365 #ifndef SQLITE_OMIT_AUTOVACUUM 001366 /* 001367 ** The cell pCell is currently part of page pSrc but will ultimately be part 001368 ** of pPage. (pSrc and pPager are often the same.) If pCell contains a 001369 ** pointer to an overflow page, insert an entry into the pointer-map for 001370 ** the overflow page that will be valid after pCell has been moved to pPage. 001371 */ 001372 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){ 001373 CellInfo info; 001374 if( *pRC ) return; 001375 assert( pCell!=0 ); 001376 pPage->xParseCell(pPage, pCell, &info); 001377 if( info.nLocal<info.nPayload ){ 001378 Pgno ovfl; 001379 if( SQLITE_WITHIN(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){ 001380 testcase( pSrc!=pPage ); 001381 *pRC = SQLITE_CORRUPT_BKPT; 001382 return; 001383 } 001384 ovfl = get4byte(&pCell[info.nSize-4]); 001385 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); 001386 } 001387 } 001388 #endif 001389 001390 001391 /* 001392 ** Defragment the page given. This routine reorganizes cells within the 001393 ** page so that there are no free-blocks on the free-block list. 001394 ** 001395 ** Parameter nMaxFrag is the maximum amount of fragmented space that may be 001396 ** present in the page after this routine returns. 001397 ** 001398 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a 001399 ** b-tree page so that there are no freeblocks or fragment bytes, all 001400 ** unused bytes are contained in the unallocated space region, and all 001401 ** cells are packed tightly at the end of the page. 
001402 */ 001403 static int defragmentPage(MemPage *pPage, int nMaxFrag){ 001404 int i; /* Loop counter */ 001405 int pc; /* Address of the i-th cell */ 001406 int hdr; /* Offset to the page header */ 001407 int size; /* Size of a cell */ 001408 int usableSize; /* Number of usable bytes on a page */ 001409 int cellOffset; /* Offset to the cell pointer array */ 001410 int cbrk; /* Offset to the cell content area */ 001411 int nCell; /* Number of cells on the page */ 001412 unsigned char *data; /* The page data */ 001413 unsigned char *temp; /* Temp area for cell content */ 001414 unsigned char *src; /* Source of content */ 001415 int iCellFirst; /* First allowable cell index */ 001416 int iCellLast; /* Last possible cell index */ 001417 001418 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001419 assert( pPage->pBt!=0 ); 001420 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 001421 assert( pPage->nOverflow==0 ); 001422 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001423 temp = 0; 001424 src = data = pPage->aData; 001425 hdr = pPage->hdrOffset; 001426 cellOffset = pPage->cellOffset; 001427 nCell = pPage->nCell; 001428 assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB ); 001429 iCellFirst = cellOffset + 2*nCell; 001430 usableSize = pPage->pBt->usableSize; 001431 001432 /* This block handles pages with two or fewer free blocks and nMaxFrag 001433 ** or fewer fragmented bytes. In this case it is faster to move the 001434 ** two (or one) blocks of cells using memmove() and add the required 001435 ** offsets to each pointer in the cell-pointer array than it is to 001436 ** reconstruct the entire page. 
*/ 001437 if( (int)data[hdr+7]<=nMaxFrag ){ 001438 int iFree = get2byte(&data[hdr+1]); 001439 if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage); 001440 if( iFree ){ 001441 int iFree2 = get2byte(&data[iFree]); 001442 if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage); 001443 if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){ 001444 u8 *pEnd = &data[cellOffset + nCell*2]; 001445 u8 *pAddr; 001446 int sz2 = 0; 001447 int sz = get2byte(&data[iFree+2]); 001448 int top = get2byte(&data[hdr+5]); 001449 if( top>=iFree ){ 001450 return SQLITE_CORRUPT_PAGE(pPage); 001451 } 001452 if( iFree2 ){ 001453 if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage); 001454 sz2 = get2byte(&data[iFree2+2]); 001455 if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage); 001456 memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz)); 001457 sz += sz2; 001458 }else if( iFree+sz>usableSize ){ 001459 return SQLITE_CORRUPT_PAGE(pPage); 001460 } 001461 001462 cbrk = top+sz; 001463 assert( cbrk+(iFree-top) <= usableSize ); 001464 memmove(&data[cbrk], &data[top], iFree-top); 001465 for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){ 001466 pc = get2byte(pAddr); 001467 if( pc<iFree ){ put2byte(pAddr, pc+sz); } 001468 else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); } 001469 } 001470 goto defragment_out; 001471 } 001472 } 001473 } 001474 001475 cbrk = usableSize; 001476 iCellLast = usableSize - 4; 001477 for(i=0; i<nCell; i++){ 001478 u8 *pAddr; /* The i-th cell pointer */ 001479 pAddr = &data[cellOffset + i*2]; 001480 pc = get2byte(pAddr); 001481 testcase( pc==iCellFirst ); 001482 testcase( pc==iCellLast ); 001483 /* These conditions have already been verified in btreeInitPage() 001484 ** if PRAGMA cell_size_check=ON. 
001485 */ 001486 if( pc<iCellFirst || pc>iCellLast ){ 001487 return SQLITE_CORRUPT_PAGE(pPage); 001488 } 001489 assert( pc>=iCellFirst && pc<=iCellLast ); 001490 size = pPage->xCellSize(pPage, &src[pc]); 001491 cbrk -= size; 001492 if( cbrk<iCellFirst || pc+size>usableSize ){ 001493 return SQLITE_CORRUPT_PAGE(pPage); 001494 } 001495 assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); 001496 testcase( cbrk+size==usableSize ); 001497 testcase( pc+size==usableSize ); 001498 put2byte(pAddr, cbrk); 001499 if( temp==0 ){ 001500 int x; 001501 if( cbrk==pc ) continue; 001502 temp = sqlite3PagerTempSpace(pPage->pBt->pPager); 001503 x = get2byte(&data[hdr+5]); 001504 memcpy(&temp[x], &data[x], (cbrk+size) - x); 001505 src = temp; 001506 } 001507 memcpy(&data[cbrk], &src[pc], size); 001508 } 001509 data[hdr+7] = 0; 001510 001511 defragment_out: 001512 assert( pPage->nFree>=0 ); 001513 if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){ 001514 return SQLITE_CORRUPT_PAGE(pPage); 001515 } 001516 assert( cbrk>=iCellFirst ); 001517 put2byte(&data[hdr+5], cbrk); 001518 data[hdr+1] = 0; 001519 data[hdr+2] = 0; 001520 memset(&data[iCellFirst], 0, cbrk-iCellFirst); 001521 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001522 return SQLITE_OK; 001523 } 001524 001525 /* 001526 ** Search the free-list on page pPg for space to store a cell nByte bytes in 001527 ** size. If one can be found, return a pointer to the space and remove it 001528 ** from the free-list. 001529 ** 001530 ** If no suitable space can be found on the free-list, return NULL. 001531 ** 001532 ** This function may detect corruption within pPg. If corruption is 001533 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned. 001534 ** 001535 ** Slots on the free list that are between 1 and 3 bytes larger than nByte 001536 ** will be ignored if adding the extra space to the fragmentation count 001537 ** causes the fragmentation count to exceed 60. 
001538 */ 001539 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ 001540 const int hdr = pPg->hdrOffset; /* Offset to page header */ 001541 u8 * const aData = pPg->aData; /* Page data */ 001542 int iAddr = hdr + 1; /* Address of ptr to pc */ 001543 int pc = get2byte(&aData[iAddr]); /* Address of a free slot */ 001544 int x; /* Excess size of the slot */ 001545 int maxPC = pPg->pBt->usableSize - nByte; /* Max address for a usable slot */ 001546 int size; /* Size of the free slot */ 001547 001548 assert( pc>0 ); 001549 while( pc<=maxPC ){ 001550 /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each 001551 ** freeblock form a big-endian integer which is the size of the freeblock 001552 ** in bytes, including the 4-byte header. */ 001553 size = get2byte(&aData[pc+2]); 001554 if( (x = size - nByte)>=0 ){ 001555 testcase( x==4 ); 001556 testcase( x==3 ); 001557 if( x<4 ){ 001558 /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total 001559 ** number of bytes in fragments may not exceed 60. */ 001560 if( aData[hdr+7]>57 ) return 0; 001561 001562 /* Remove the slot from the free-list. Update the number of 001563 ** fragmented bytes within the page. */ 001564 memcpy(&aData[iAddr], &aData[pc], 2); 001565 aData[hdr+7] += (u8)x; 001566 }else if( x+pc > maxPC ){ 001567 /* This slot extends off the end of the usable part of the page */ 001568 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001569 return 0; 001570 }else{ 001571 /* The slot remains on the free-list. Reduce its size to account 001572 ** for the portion used by the new allocation. 
*/ 001573 put2byte(&aData[pc+2], x); 001574 } 001575 return &aData[pc + x]; 001576 } 001577 iAddr = pc; 001578 pc = get2byte(&aData[pc]); 001579 if( pc<=iAddr+size ){ 001580 if( pc ){ 001581 /* The next slot in the chain is not past the end of the current slot */ 001582 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001583 } 001584 return 0; 001585 } 001586 } 001587 if( pc>maxPC+nByte-4 ){ 001588 /* The free slot chain extends off the end of the page */ 001589 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001590 } 001591 return 0; 001592 } 001593 001594 /* 001595 ** Allocate nByte bytes of space from within the B-Tree page passed 001596 ** as the first argument. Write into *pIdx the index into pPage->aData[] 001597 ** of the first byte of allocated space. Return either SQLITE_OK or 001598 ** an error code (usually SQLITE_CORRUPT). 001599 ** 001600 ** The caller guarantees that there is sufficient space to make the 001601 ** allocation. This routine might need to defragment in order to bring 001602 ** all the space together, however. This routine will avoid using 001603 ** the first two bytes past the cell pointer area since presumably this 001604 ** allocation is being made in order to insert a new cell, so we will 001605 ** also end up needing a new cell pointer. 
001606 */ 001607 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ 001608 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ 001609 u8 * const data = pPage->aData; /* Local cache of pPage->aData */ 001610 int top; /* First byte of cell content area */ 001611 int rc = SQLITE_OK; /* Integer return code */ 001612 int gap; /* First byte of gap between cell pointers and cell content */ 001613 001614 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001615 assert( pPage->pBt ); 001616 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001617 assert( nByte>=0 ); /* Minimum cell size is 4 */ 001618 assert( pPage->nFree>=nByte ); 001619 assert( pPage->nOverflow==0 ); 001620 assert( nByte < (int)(pPage->pBt->usableSize-8) ); 001621 001622 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); 001623 gap = pPage->cellOffset + 2*pPage->nCell; 001624 assert( gap<=65536 ); 001625 /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size 001626 ** and the reserved space is zero (the usual value for reserved space) 001627 ** then the cell content offset of an empty page wants to be 65536. 001628 ** However, that integer is too large to be stored in a 2-byte unsigned 001629 ** integer, so a value of 0 is used in its place. */ 001630 top = get2byte(&data[hdr+5]); 001631 assert( top<=(int)pPage->pBt->usableSize ); /* by btreeComputeFreeSpace() */ 001632 if( gap>top ){ 001633 if( top==0 && pPage->pBt->usableSize==65536 ){ 001634 top = 65536; 001635 }else{ 001636 return SQLITE_CORRUPT_PAGE(pPage); 001637 } 001638 } 001639 001640 /* If there is enough space between gap and top for one more cell pointer, 001641 ** and if the freelist is not empty, then search the 001642 ** freelist looking for a slot big enough to satisfy the request. 
001643 */ 001644 testcase( gap+2==top ); 001645 testcase( gap+1==top ); 001646 testcase( gap==top ); 001647 if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){ 001648 u8 *pSpace = pageFindSlot(pPage, nByte, &rc); 001649 if( pSpace ){ 001650 assert( pSpace+nByte<=data+pPage->pBt->usableSize ); 001651 if( (*pIdx = (int)(pSpace-data))<=gap ){ 001652 return SQLITE_CORRUPT_PAGE(pPage); 001653 }else{ 001654 return SQLITE_OK; 001655 } 001656 }else if( rc ){ 001657 return rc; 001658 } 001659 } 001660 001661 /* The request could not be fulfilled using a freelist slot. Check 001662 ** to see if defragmentation is necessary. 001663 */ 001664 testcase( gap+2+nByte==top ); 001665 if( gap+2+nByte>top ){ 001666 assert( pPage->nCell>0 || CORRUPT_DB ); 001667 assert( pPage->nFree>=0 ); 001668 rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte))); 001669 if( rc ) return rc; 001670 top = get2byteNotZero(&data[hdr+5]); 001671 assert( gap+2+nByte<=top ); 001672 } 001673 001674 001675 /* Allocate memory from the gap in between the cell pointer array 001676 ** and the cell content area. The btreeComputeFreeSpace() call has already 001677 ** validated the freelist. Given that the freelist is valid, there 001678 ** is no way that the allocation can extend off the end of the page. 001679 ** The assert() below verifies the previous sentence. 001680 */ 001681 top -= nByte; 001682 put2byte(&data[hdr+5], top); 001683 assert( top+nByte <= (int)pPage->pBt->usableSize ); 001684 *pIdx = top; 001685 return SQLITE_OK; 001686 } 001687 001688 /* 001689 ** Return a section of the pPage->aData to the freelist. 001690 ** The first byte of the new free block is pPage->aData[iStart] 001691 ** and the size of the block is iSize bytes. 001692 ** 001693 ** Adjacent freeblocks are coalesced. 001694 ** 001695 ** Even though the freeblock list was checked by btreeComputeFreeSpace(), 001696 ** that routine will not detect overlap between cells or freeblocks. 
Nor 001697 ** does it detect cells or freeblocks that encrouch into the reserved bytes 001698 ** at the end of the page. So do additional corruption checks inside this 001699 ** routine and return SQLITE_CORRUPT if any problems are found. 001700 */ 001701 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ 001702 u16 iPtr; /* Address of ptr to next freeblock */ 001703 u16 iFreeBlk; /* Address of the next freeblock */ 001704 u8 hdr; /* Page header size. 0 or 100 */ 001705 u8 nFrag = 0; /* Reduction in fragmentation */ 001706 u16 iOrigSize = iSize; /* Original value of iSize */ 001707 u16 x; /* Offset to cell content area */ 001708 u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */ 001709 unsigned char *data = pPage->aData; /* Page content */ 001710 001711 assert( pPage->pBt!=0 ); 001712 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001713 assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize ); 001714 assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize ); 001715 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001716 assert( iSize>=4 ); /* Minimum cell size is 4 */ 001717 assert( iStart<=pPage->pBt->usableSize-4 ); 001718 001719 /* The list of freeblocks must be in ascending order. Find the 001720 ** spot on the list where iStart should be inserted. 
001721 */ 001722 hdr = pPage->hdrOffset; 001723 iPtr = hdr + 1; 001724 if( data[iPtr+1]==0 && data[iPtr]==0 ){ 001725 iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */ 001726 }else{ 001727 while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){ 001728 if( iFreeBlk<iPtr+4 ){ 001729 if( iFreeBlk==0 ) break; 001730 return SQLITE_CORRUPT_PAGE(pPage); 001731 } 001732 iPtr = iFreeBlk; 001733 } 001734 if( iFreeBlk>pPage->pBt->usableSize-4 ){ 001735 return SQLITE_CORRUPT_PAGE(pPage); 001736 } 001737 assert( iFreeBlk>iPtr || iFreeBlk==0 ); 001738 001739 /* At this point: 001740 ** iFreeBlk: First freeblock after iStart, or zero if none 001741 ** iPtr: The address of a pointer to iFreeBlk 001742 ** 001743 ** Check to see if iFreeBlk should be coalesced onto the end of iStart. 001744 */ 001745 if( iFreeBlk && iEnd+3>=iFreeBlk ){ 001746 nFrag = iFreeBlk - iEnd; 001747 if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage); 001748 iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]); 001749 if( iEnd > pPage->pBt->usableSize ){ 001750 return SQLITE_CORRUPT_PAGE(pPage); 001751 } 001752 iSize = iEnd - iStart; 001753 iFreeBlk = get2byte(&data[iFreeBlk]); 001754 } 001755 001756 /* If iPtr is another freeblock (that is, if iPtr is not the freelist 001757 ** pointer in the page header) then check to see if iStart should be 001758 ** coalesced onto the end of iPtr. 
001759 */ 001760 if( iPtr>hdr+1 ){ 001761 int iPtrEnd = iPtr + get2byte(&data[iPtr+2]); 001762 if( iPtrEnd+3>=iStart ){ 001763 if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage); 001764 nFrag += iStart - iPtrEnd; 001765 iSize = iEnd - iPtr; 001766 iStart = iPtr; 001767 } 001768 } 001769 if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage); 001770 data[hdr+7] -= nFrag; 001771 } 001772 x = get2byte(&data[hdr+5]); 001773 if( iStart<=x ){ 001774 /* The new freeblock is at the beginning of the cell content area, 001775 ** so just extend the cell content area rather than create another 001776 ** freelist entry */ 001777 if( iStart<x || iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage); 001778 put2byte(&data[hdr+1], iFreeBlk); 001779 put2byte(&data[hdr+5], iEnd); 001780 }else{ 001781 /* Insert the new freeblock into the freelist */ 001782 put2byte(&data[iPtr], iStart); 001783 } 001784 if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){ 001785 /* Overwrite deleted information with zeros when the secure_delete 001786 ** option is enabled */ 001787 memset(&data[iStart], 0, iSize); 001788 } 001789 put2byte(&data[iStart], iFreeBlk); 001790 put2byte(&data[iStart+2], iSize); 001791 pPage->nFree += iOrigSize; 001792 return SQLITE_OK; 001793 } 001794 001795 /* 001796 ** Decode the flags byte (the first byte of the header) for a page 001797 ** and initialize fields of the MemPage structure accordingly. 001798 ** 001799 ** Only the following combinations are supported. Anything different 001800 ** indicates a corrupt database files: 001801 ** 001802 ** PTF_ZERODATA 001803 ** PTF_ZERODATA | PTF_LEAF 001804 ** PTF_LEAFDATA | PTF_INTKEY 001805 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF 001806 */ 001807 static int decodeFlags(MemPage *pPage, int flagByte){ 001808 BtShared *pBt; /* A copy of pPage->pBt */ 001809 001810 assert( pPage->hdrOffset==(pPage->pgno==1 ? 
100 : 0) ); 001811 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001812 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 ); 001813 flagByte &= ~PTF_LEAF; 001814 pPage->childPtrSize = 4-4*pPage->leaf; 001815 pPage->xCellSize = cellSizePtr; 001816 pBt = pPage->pBt; 001817 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ 001818 /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an 001819 ** interior table b-tree page. */ 001820 assert( (PTF_LEAFDATA|PTF_INTKEY)==5 ); 001821 /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a 001822 ** leaf table b-tree page. */ 001823 assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 ); 001824 pPage->intKey = 1; 001825 if( pPage->leaf ){ 001826 pPage->intKeyLeaf = 1; 001827 pPage->xParseCell = btreeParseCellPtr; 001828 }else{ 001829 pPage->intKeyLeaf = 0; 001830 pPage->xCellSize = cellSizePtrNoPayload; 001831 pPage->xParseCell = btreeParseCellPtrNoPayload; 001832 } 001833 pPage->maxLocal = pBt->maxLeaf; 001834 pPage->minLocal = pBt->minLeaf; 001835 }else if( flagByte==PTF_ZERODATA ){ 001836 /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an 001837 ** interior index b-tree page. */ 001838 assert( (PTF_ZERODATA)==2 ); 001839 /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a 001840 ** leaf index b-tree page. */ 001841 assert( (PTF_ZERODATA|PTF_LEAF)==10 ); 001842 pPage->intKey = 0; 001843 pPage->intKeyLeaf = 0; 001844 pPage->xParseCell = btreeParseCellPtrIndex; 001845 pPage->maxLocal = pBt->maxLocal; 001846 pPage->minLocal = pBt->minLocal; 001847 }else{ 001848 /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is 001849 ** an error. */ 001850 return SQLITE_CORRUPT_PAGE(pPage); 001851 } 001852 pPage->max1bytePayload = pBt->max1bytePayload; 001853 return SQLITE_OK; 001854 } 001855 001856 /* 001857 ** Compute the amount of freespace on the page. In other words, fill 001858 ** in the pPage->nFree field. 
001859 */ 001860 static int btreeComputeFreeSpace(MemPage *pPage){ 001861 int pc; /* Address of a freeblock within pPage->aData[] */ 001862 u8 hdr; /* Offset to beginning of page header */ 001863 u8 *data; /* Equal to pPage->aData */ 001864 int usableSize; /* Amount of usable space on each page */ 001865 int nFree; /* Number of unused bytes on the page */ 001866 int top; /* First byte of the cell content area */ 001867 int iCellFirst; /* First allowable cell or freeblock offset */ 001868 int iCellLast; /* Last possible cell or freeblock offset */ 001869 001870 assert( pPage->pBt!=0 ); 001871 assert( pPage->pBt->db!=0 ); 001872 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001873 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 001874 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 001875 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 001876 assert( pPage->isInit==1 ); 001877 assert( pPage->nFree<0 ); 001878 001879 usableSize = pPage->pBt->usableSize; 001880 hdr = pPage->hdrOffset; 001881 data = pPage->aData; 001882 /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates 001883 ** the start of the cell content area. A zero value for this integer is 001884 ** interpreted as 65536. */ 001885 top = get2byteNotZero(&data[hdr+5]); 001886 iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell; 001887 iCellLast = usableSize - 4; 001888 001889 /* Compute the total free space on the page 001890 ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the 001891 ** start of the first freeblock on the page, or is zero if there are no 001892 ** freeblocks. */ 001893 pc = get2byte(&data[hdr+1]); 001894 nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */ 001895 if( pc>0 ){ 001896 u32 next, size; 001897 if( pc<iCellFirst ){ 001898 /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will 001899 ** always be at least one cell before the first freeblock. 
001900 */ 001901 return SQLITE_CORRUPT_PAGE(pPage); 001902 } 001903 while( 1 ){ 001904 if( pc>iCellLast ){ 001905 /* Freeblock off the end of the page */ 001906 return SQLITE_CORRUPT_PAGE(pPage); 001907 } 001908 next = get2byte(&data[pc]); 001909 size = get2byte(&data[pc+2]); 001910 nFree = nFree + size; 001911 if( next<=pc+size+3 ) break; 001912 pc = next; 001913 } 001914 if( next>0 ){ 001915 /* Freeblock not in ascending order */ 001916 return SQLITE_CORRUPT_PAGE(pPage); 001917 } 001918 if( pc+size>(unsigned int)usableSize ){ 001919 /* Last freeblock extends past page end */ 001920 return SQLITE_CORRUPT_PAGE(pPage); 001921 } 001922 } 001923 001924 /* At this point, nFree contains the sum of the offset to the start 001925 ** of the cell-content area plus the number of free bytes within 001926 ** the cell-content area. If this is greater than the usable-size 001927 ** of the page, then the page must be corrupted. This check also 001928 ** serves to verify that the offset to the start of the cell-content 001929 ** area, according to the page header, lies within the page. 
001930 */ 001931 if( nFree>usableSize || nFree<iCellFirst ){ 001932 return SQLITE_CORRUPT_PAGE(pPage); 001933 } 001934 pPage->nFree = (u16)(nFree - iCellFirst); 001935 return SQLITE_OK; 001936 } 001937 001938 /* 001939 ** Do additional sanity check after btreeInitPage() if 001940 ** PRAGMA cell_size_check=ON 001941 */ 001942 static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){ 001943 int iCellFirst; /* First allowable cell or freeblock offset */ 001944 int iCellLast; /* Last possible cell or freeblock offset */ 001945 int i; /* Index into the cell pointer array */ 001946 int sz; /* Size of a cell */ 001947 int pc; /* Address of a freeblock within pPage->aData[] */ 001948 u8 *data; /* Equal to pPage->aData */ 001949 int usableSize; /* Maximum usable space on the page */ 001950 int cellOffset; /* Start of cell content area */ 001951 001952 iCellFirst = pPage->cellOffset + 2*pPage->nCell; 001953 usableSize = pPage->pBt->usableSize; 001954 iCellLast = usableSize - 4; 001955 data = pPage->aData; 001956 cellOffset = pPage->cellOffset; 001957 if( !pPage->leaf ) iCellLast--; 001958 for(i=0; i<pPage->nCell; i++){ 001959 pc = get2byteAligned(&data[cellOffset+i*2]); 001960 testcase( pc==iCellFirst ); 001961 testcase( pc==iCellLast ); 001962 if( pc<iCellFirst || pc>iCellLast ){ 001963 return SQLITE_CORRUPT_PAGE(pPage); 001964 } 001965 sz = pPage->xCellSize(pPage, &data[pc]); 001966 testcase( pc+sz==usableSize ); 001967 if( pc+sz>usableSize ){ 001968 return SQLITE_CORRUPT_PAGE(pPage); 001969 } 001970 } 001971 return SQLITE_OK; 001972 } 001973 001974 /* 001975 ** Initialize the auxiliary information for a disk block. 001976 ** 001977 ** Return SQLITE_OK on success. If we see that the page does 001978 ** not contain a well-formed database page, then return 001979 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 001980 ** guarantee that the page is well-formed. It only shows that 001981 ** we failed to detect any corruption. 
001982 */ 001983 static int btreeInitPage(MemPage *pPage){ 001984 u8 *data; /* Equal to pPage->aData */ 001985 BtShared *pBt; /* The main btree structure */ 001986 001987 assert( pPage->pBt!=0 ); 001988 assert( pPage->pBt->db!=0 ); 001989 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001990 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 001991 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 001992 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 001993 assert( pPage->isInit==0 ); 001994 001995 pBt = pPage->pBt; 001996 data = pPage->aData + pPage->hdrOffset; 001997 /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating 001998 ** the b-tree page type. */ 001999 if( decodeFlags(pPage, data[0]) ){ 002000 return SQLITE_CORRUPT_PAGE(pPage); 002001 } 002002 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 002003 pPage->maskPage = (u16)(pBt->pageSize - 1); 002004 pPage->nOverflow = 0; 002005 pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize; 002006 pPage->aCellIdx = data + pPage->childPtrSize + 8; 002007 pPage->aDataEnd = pPage->aData + pBt->usableSize; 002008 pPage->aDataOfst = pPage->aData + pPage->childPtrSize; 002009 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the 002010 ** number of cells on the page. */ 002011 pPage->nCell = get2byte(&data[3]); 002012 if( pPage->nCell>MX_CELL(pBt) ){ 002013 /* To many cells for a single page. The page must be corrupt */ 002014 return SQLITE_CORRUPT_PAGE(pPage); 002015 } 002016 testcase( pPage->nCell==MX_CELL(pBt) ); 002017 /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only 002018 ** possible for a root page of a table that contains no rows) then the 002019 ** offset to the cell content area will equal the page size minus the 002020 ** bytes of reserved space. 
*/ 002021 assert( pPage->nCell>0 002022 || get2byteNotZero(&data[5])==(int)pBt->usableSize 002023 || CORRUPT_DB ); 002024 pPage->nFree = -1; /* Indicate that this value is yet uncomputed */ 002025 pPage->isInit = 1; 002026 if( pBt->db->flags & SQLITE_CellSizeCk ){ 002027 return btreeCellSizeCheck(pPage); 002028 } 002029 return SQLITE_OK; 002030 } 002031 002032 /* 002033 ** Set up a raw page so that it looks like a database page holding 002034 ** no entries. 002035 */ 002036 static void zeroPage(MemPage *pPage, int flags){ 002037 unsigned char *data = pPage->aData; 002038 BtShared *pBt = pPage->pBt; 002039 u8 hdr = pPage->hdrOffset; 002040 u16 first; 002041 002042 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); 002043 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002044 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 002045 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 002046 assert( sqlite3_mutex_held(pBt->mutex) ); 002047 if( pBt->btsFlags & BTS_FAST_SECURE ){ 002048 memset(&data[hdr], 0, pBt->usableSize - hdr); 002049 } 002050 data[hdr] = (char)flags; 002051 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8); 002052 memset(&data[hdr+1], 0, 4); 002053 data[hdr+7] = 0; 002054 put2byte(&data[hdr+5], pBt->usableSize); 002055 pPage->nFree = (u16)(pBt->usableSize - first); 002056 decodeFlags(pPage, flags); 002057 pPage->cellOffset = first; 002058 pPage->aDataEnd = &data[pBt->usableSize]; 002059 pPage->aCellIdx = &data[first]; 002060 pPage->aDataOfst = &data[pPage->childPtrSize]; 002061 pPage->nOverflow = 0; 002062 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 002063 pPage->maskPage = (u16)(pBt->pageSize - 1); 002064 pPage->nCell = 0; 002065 pPage->isInit = 1; 002066 } 002067 002068 002069 /* 002070 ** Convert a DbPage obtained from the pager into a MemPage used by 002071 ** the btree layer. 
002072 */ 002073 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ 002074 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 002075 if( pgno!=pPage->pgno ){ 002076 pPage->aData = sqlite3PagerGetData(pDbPage); 002077 pPage->pDbPage = pDbPage; 002078 pPage->pBt = pBt; 002079 pPage->pgno = pgno; 002080 pPage->hdrOffset = pgno==1 ? 100 : 0; 002081 } 002082 assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); 002083 return pPage; 002084 } 002085 002086 /* 002087 ** Get a page from the pager. Initialize the MemPage.pBt and 002088 ** MemPage.aData elements if needed. See also: btreeGetUnusedPage(). 002089 ** 002090 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care 002091 ** about the content of the page at this time. So do not go to the disk 002092 ** to fetch the content. Just fill in the content with zeros for now. 002093 ** If in the future we call sqlite3PagerWrite() on this page, that 002094 ** means we have started to be concerned about content and the disk 002095 ** read should occur at that point. 002096 */ 002097 static int btreeGetPage( 002098 BtShared *pBt, /* The btree */ 002099 Pgno pgno, /* Number of the page to fetch */ 002100 MemPage **ppPage, /* Return the page in this parameter */ 002101 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ 002102 ){ 002103 int rc; 002104 DbPage *pDbPage; 002105 002106 assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY ); 002107 assert( sqlite3_mutex_held(pBt->mutex) ); 002108 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags); 002109 if( rc ) return rc; 002110 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); 002111 return SQLITE_OK; 002112 } 002113 002114 /* 002115 ** Retrieve a page from the pager cache. If the requested page is not 002116 ** already in the pager cache return NULL. Initialize the MemPage.pBt and 002117 ** MemPage.aData elements if needed. 
002118 */ 002119 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ 002120 DbPage *pDbPage; 002121 assert( sqlite3_mutex_held(pBt->mutex) ); 002122 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 002123 if( pDbPage ){ 002124 return btreePageFromDbPage(pDbPage, pgno, pBt); 002125 } 002126 return 0; 002127 } 002128 002129 /* 002130 ** Return the size of the database file in pages. If there is any kind of 002131 ** error, return ((unsigned int)-1). 002132 */ 002133 static Pgno btreePagecount(BtShared *pBt){ 002134 return pBt->nPage; 002135 } 002136 u32 sqlite3BtreeLastPage(Btree *p){ 002137 assert( sqlite3BtreeHoldsMutex(p) ); 002138 assert( ((p->pBt->nPage)&0x80000000)==0 ); 002139 return btreePagecount(p->pBt); 002140 } 002141 002142 /* 002143 ** Get a page from the pager and initialize it. 002144 ** 002145 ** If pCur!=0 then the page is being fetched as part of a moveToChild() 002146 ** call. Do additional sanity checking on the page in this case. 002147 ** And if the fetch fails, this routine must decrement pCur->iPage. 002148 ** 002149 ** The page is fetched as read-write unless pCur is not NULL and is 002150 ** a read-only cursor. 002151 ** 002152 ** If an error occurs, then *ppPage is undefined. It 002153 ** may remain unchanged, or it may be set to an invalid value. 
002154 */ 002155 static int getAndInitPage( 002156 BtShared *pBt, /* The database file */ 002157 Pgno pgno, /* Number of the page to get */ 002158 MemPage **ppPage, /* Write the page pointer here */ 002159 BtCursor *pCur, /* Cursor to receive the page, or NULL */ 002160 int bReadOnly /* True for a read-only page */ 002161 ){ 002162 int rc; 002163 DbPage *pDbPage; 002164 assert( sqlite3_mutex_held(pBt->mutex) ); 002165 assert( pCur==0 || ppPage==&pCur->pPage ); 002166 assert( pCur==0 || bReadOnly==pCur->curPagerFlags ); 002167 assert( pCur==0 || pCur->iPage>0 ); 002168 002169 if( pgno>btreePagecount(pBt) ){ 002170 rc = SQLITE_CORRUPT_BKPT; 002171 goto getAndInitPage_error1; 002172 } 002173 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly); 002174 if( rc ){ 002175 goto getAndInitPage_error1; 002176 } 002177 *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 002178 if( (*ppPage)->isInit==0 ){ 002179 btreePageFromDbPage(pDbPage, pgno, pBt); 002180 rc = btreeInitPage(*ppPage); 002181 if( rc!=SQLITE_OK ){ 002182 goto getAndInitPage_error2; 002183 } 002184 } 002185 assert( (*ppPage)->pgno==pgno ); 002186 assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) ); 002187 002188 /* If obtaining a child page for a cursor, we must verify that the page is 002189 ** compatible with the root page. */ 002190 if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){ 002191 rc = SQLITE_CORRUPT_PGNO(pgno); 002192 goto getAndInitPage_error2; 002193 } 002194 return SQLITE_OK; 002195 002196 getAndInitPage_error2: 002197 releasePage(*ppPage); 002198 getAndInitPage_error1: 002199 if( pCur ){ 002200 pCur->iPage--; 002201 pCur->pPage = pCur->apPage[pCur->iPage]; 002202 } 002203 testcase( pgno==0 ); 002204 assert( pgno!=0 || rc==SQLITE_CORRUPT ); 002205 return rc; 002206 } 002207 002208 /* 002209 ** Release a MemPage. This should be called once for each prior 002210 ** call to btreeGetPage. 
002211 ** 002212 ** Page1 is a special case and must be released using releasePageOne(). 002213 */ 002214 static void releasePageNotNull(MemPage *pPage){ 002215 assert( pPage->aData ); 002216 assert( pPage->pBt ); 002217 assert( pPage->pDbPage!=0 ); 002218 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002219 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 002220 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002221 sqlite3PagerUnrefNotNull(pPage->pDbPage); 002222 } 002223 static void releasePage(MemPage *pPage){ 002224 if( pPage ) releasePageNotNull(pPage); 002225 } 002226 static void releasePageOne(MemPage *pPage){ 002227 assert( pPage!=0 ); 002228 assert( pPage->aData ); 002229 assert( pPage->pBt ); 002230 assert( pPage->pDbPage!=0 ); 002231 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002232 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 002233 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002234 sqlite3PagerUnrefPageOne(pPage->pDbPage); 002235 } 002236 002237 /* 002238 ** Get an unused page. 002239 ** 002240 ** This works just like btreeGetPage() with the addition: 002241 ** 002242 ** * If the page is already in use for some other purpose, immediately 002243 ** release it and return an SQLITE_CURRUPT error. 
002244 ** * Make sure the isInit flag is clear 002245 */ 002246 static int btreeGetUnusedPage( 002247 BtShared *pBt, /* The btree */ 002248 Pgno pgno, /* Number of the page to fetch */ 002249 MemPage **ppPage, /* Return the page in this parameter */ 002250 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ 002251 ){ 002252 int rc = btreeGetPage(pBt, pgno, ppPage, flags); 002253 if( rc==SQLITE_OK ){ 002254 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ 002255 releasePage(*ppPage); 002256 *ppPage = 0; 002257 return SQLITE_CORRUPT_BKPT; 002258 } 002259 (*ppPage)->isInit = 0; 002260 }else{ 002261 *ppPage = 0; 002262 } 002263 return rc; 002264 } 002265 002266 002267 /* 002268 ** During a rollback, when the pager reloads information into the cache 002269 ** so that the cache is restored to its original state at the start of 002270 ** the transaction, for each page restored this routine is called. 002271 ** 002272 ** This routine needs to reset the extra data section at the end of the 002273 ** page to agree with the restored data. 002274 */ 002275 static void pageReinit(DbPage *pData){ 002276 MemPage *pPage; 002277 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 002278 assert( sqlite3PagerPageRefcount(pData)>0 ); 002279 if( pPage->isInit ){ 002280 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002281 pPage->isInit = 0; 002282 if( sqlite3PagerPageRefcount(pData)>1 ){ 002283 /* pPage might not be a btree page; it might be an overflow page 002284 ** or ptrmap page or a free page. In those cases, the following 002285 ** call to btreeInitPage() will likely return SQLITE_CORRUPT. 002286 ** But no harm is done by this. And it is very important that 002287 ** btreeInitPage() be called on every btree page so we make 002288 ** the call for every page that comes in for re-initing. */ 002289 btreeInitPage(pPage); 002290 } 002291 } 002292 } 002293 002294 /* 002295 ** Invoke the busy handler for a btree. 
002296 */ 002297 static int btreeInvokeBusyHandler(void *pArg){ 002298 BtShared *pBt = (BtShared*)pArg; 002299 assert( pBt->db ); 002300 assert( sqlite3_mutex_held(pBt->db->mutex) ); 002301 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler, 002302 sqlite3PagerFile(pBt->pPager)); 002303 } 002304 002305 /* 002306 ** Open a database file. 002307 ** 002308 ** zFilename is the name of the database file. If zFilename is NULL 002309 ** then an ephemeral database is created. The ephemeral database might 002310 ** be exclusively in memory, or it might use a disk-based memory cache. 002311 ** Either way, the ephemeral database will be automatically deleted 002312 ** when sqlite3BtreeClose() is called. 002313 ** 002314 ** If zFilename is ":memory:" then an in-memory database is created 002315 ** that is automatically destroyed when it is closed. 002316 ** 002317 ** The "flags" parameter is a bitmask that might contain bits like 002318 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY. 002319 ** 002320 ** If the database is already opened in the same database connection 002321 ** and we are in shared cache mode, then the open will fail with an 002322 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared 002323 ** objects in the same database connection since doing so will lead 002324 ** to problems with locking. 002325 */ 002326 int sqlite3BtreeOpen( 002327 sqlite3_vfs *pVfs, /* VFS to use for this b-tree */ 002328 const char *zFilename, /* Name of the file containing the BTree database */ 002329 sqlite3 *db, /* Associated database handle */ 002330 Btree **ppBtree, /* Pointer to new Btree object written here */ 002331 int flags, /* Options */ 002332 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 002333 ){ 002334 BtShared *pBt = 0; /* Shared part of btree structure */ 002335 Btree *p; /* Handle to return */ 002336 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. 
Ticket #3537 */ 002337 int rc = SQLITE_OK; /* Result code from this function */ 002338 u8 nReserve; /* Byte of unused space on each page */ 002339 unsigned char zDbHeader[100]; /* Database header content */ 002340 002341 /* True if opening an ephemeral, temporary database */ 002342 const int isTempDb = zFilename==0 || zFilename[0]==0; 002343 002344 /* Set the variable isMemdb to true for an in-memory database, or 002345 ** false for a file-based database. 002346 */ 002347 #ifdef SQLITE_OMIT_MEMORYDB 002348 const int isMemdb = 0; 002349 #else 002350 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0) 002351 || (isTempDb && sqlite3TempInMemory(db)) 002352 || (vfsFlags & SQLITE_OPEN_MEMORY)!=0; 002353 #endif 002354 002355 assert( db!=0 ); 002356 assert( pVfs!=0 ); 002357 assert( sqlite3_mutex_held(db->mutex) ); 002358 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */ 002359 002360 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */ 002361 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 ); 002362 002363 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */ 002364 assert( (flags & BTREE_SINGLE)==0 || isTempDb ); 002365 002366 if( isMemdb ){ 002367 flags |= BTREE_MEMORY; 002368 } 002369 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){ 002370 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB; 002371 } 002372 p = sqlite3MallocZero(sizeof(Btree)); 002373 if( !p ){ 002374 return SQLITE_NOMEM_BKPT; 002375 } 002376 p->inTrans = TRANS_NONE; 002377 p->db = db; 002378 #ifndef SQLITE_OMIT_SHARED_CACHE 002379 p->lock.pBtree = p; 002380 p->lock.iTable = 1; 002381 #endif 002382 002383 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002384 /* 002385 ** If this Btree is a candidate for shared cache, try to find an 002386 ** existing BtShared object that we can share with 002387 */ 002388 if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){ 002389 if( 
vfsFlags & SQLITE_OPEN_SHAREDCACHE ){ 002390 int nFilename = sqlite3Strlen30(zFilename)+1; 002391 int nFullPathname = pVfs->mxPathname+1; 002392 char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename)); 002393 MUTEX_LOGIC( sqlite3_mutex *mutexShared; ) 002394 002395 p->sharable = 1; 002396 if( !zFullPathname ){ 002397 sqlite3_free(p); 002398 return SQLITE_NOMEM_BKPT; 002399 } 002400 if( isMemdb ){ 002401 memcpy(zFullPathname, zFilename, nFilename); 002402 }else{ 002403 rc = sqlite3OsFullPathname(pVfs, zFilename, 002404 nFullPathname, zFullPathname); 002405 if( rc ){ 002406 if( rc==SQLITE_OK_SYMLINK ){ 002407 rc = SQLITE_OK; 002408 }else{ 002409 sqlite3_free(zFullPathname); 002410 sqlite3_free(p); 002411 return rc; 002412 } 002413 } 002414 } 002415 #if SQLITE_THREADSAFE 002416 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN); 002417 sqlite3_mutex_enter(mutexOpen); 002418 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 002419 sqlite3_mutex_enter(mutexShared); 002420 #endif 002421 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){ 002422 assert( pBt->nRef>0 ); 002423 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0)) 002424 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 002425 int iDb; 002426 for(iDb=db->nDb-1; iDb>=0; iDb--){ 002427 Btree *pExisting = db->aDb[iDb].pBt; 002428 if( pExisting && pExisting->pBt==pBt ){ 002429 sqlite3_mutex_leave(mutexShared); 002430 sqlite3_mutex_leave(mutexOpen); 002431 sqlite3_free(zFullPathname); 002432 sqlite3_free(p); 002433 return SQLITE_CONSTRAINT; 002434 } 002435 } 002436 p->pBt = pBt; 002437 pBt->nRef++; 002438 break; 002439 } 002440 } 002441 sqlite3_mutex_leave(mutexShared); 002442 sqlite3_free(zFullPathname); 002443 } 002444 #ifdef SQLITE_DEBUG 002445 else{ 002446 /* In debug mode, we mark all persistent databases as sharable 002447 ** even when they are not. 
This exercises the locking code and 002448 ** gives more opportunity for asserts(sqlite3_mutex_held()) 002449 ** statements to find locking problems. 002450 */ 002451 p->sharable = 1; 002452 } 002453 #endif 002454 } 002455 #endif 002456 if( pBt==0 ){ 002457 /* 002458 ** The following asserts make sure that structures used by the btree are 002459 ** the right size. This is to guard against size changes that result 002460 ** when compiling on a different architecture. 002461 */ 002462 assert( sizeof(i64)==8 ); 002463 assert( sizeof(u64)==8 ); 002464 assert( sizeof(u32)==4 ); 002465 assert( sizeof(u16)==2 ); 002466 assert( sizeof(Pgno)==4 ); 002467 002468 pBt = sqlite3MallocZero( sizeof(*pBt) ); 002469 if( pBt==0 ){ 002470 rc = SQLITE_NOMEM_BKPT; 002471 goto btree_open_out; 002472 } 002473 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 002474 sizeof(MemPage), flags, vfsFlags, pageReinit); 002475 if( rc==SQLITE_OK ){ 002476 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap); 002477 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 002478 } 002479 if( rc!=SQLITE_OK ){ 002480 goto btree_open_out; 002481 } 002482 pBt->openFlags = (u8)flags; 002483 pBt->db = db; 002484 sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt); 002485 p->pBt = pBt; 002486 002487 pBt->pCursor = 0; 002488 pBt->pPage1 = 0; 002489 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY; 002490 #if defined(SQLITE_SECURE_DELETE) 002491 pBt->btsFlags |= BTS_SECURE_DELETE; 002492 #elif defined(SQLITE_FAST_SECURE_DELETE) 002493 pBt->btsFlags |= BTS_OVERWRITE; 002494 #endif 002495 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is 002496 ** determined by the 2-byte integer located at an offset of 16 bytes from 002497 ** the beginning of the database file. 
*/ 002498 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16); 002499 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 002500 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 002501 pBt->pageSize = 0; 002502 #ifndef SQLITE_OMIT_AUTOVACUUM 002503 /* If the magic name ":memory:" will create an in-memory database, then 002504 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 002505 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 002506 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 002507 ** regular file-name. In this case the auto-vacuum applies as per normal. 002508 */ 002509 if( zFilename && !isMemdb ){ 002510 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 002511 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); 002512 } 002513 #endif 002514 nReserve = 0; 002515 }else{ 002516 /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is 002517 ** determined by the one-byte unsigned integer found at an offset of 20 002518 ** into the database file header. */ 002519 nReserve = zDbHeader[20]; 002520 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 002521 #ifndef SQLITE_OMIT_AUTOVACUUM 002522 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 002523 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 002524 #endif 002525 } 002526 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 002527 if( rc ) goto btree_open_out; 002528 pBt->usableSize = pBt->pageSize - nReserve; 002529 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 002530 002531 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002532 /* Add the new BtShared object to the linked list sharable BtShareds. 
002533 */ 002534 pBt->nRef = 1; 002535 if( p->sharable ){ 002536 MUTEX_LOGIC( sqlite3_mutex *mutexShared; ) 002537 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);) 002538 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){ 002539 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); 002540 if( pBt->mutex==0 ){ 002541 rc = SQLITE_NOMEM_BKPT; 002542 goto btree_open_out; 002543 } 002544 } 002545 sqlite3_mutex_enter(mutexShared); 002546 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); 002547 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt; 002548 sqlite3_mutex_leave(mutexShared); 002549 } 002550 #endif 002551 } 002552 002553 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002554 /* If the new Btree uses a sharable pBtShared, then link the new 002555 ** Btree into the list of all sharable Btrees for the same connection. 002556 ** The list is kept in ascending order by pBt address. 002557 */ 002558 if( p->sharable ){ 002559 int i; 002560 Btree *pSib; 002561 for(i=0; i<db->nDb; i++){ 002562 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ 002563 while( pSib->pPrev ){ pSib = pSib->pPrev; } 002564 if( (uptr)p->pBt<(uptr)pSib->pBt ){ 002565 p->pNext = pSib; 002566 p->pPrev = 0; 002567 pSib->pPrev = p; 002568 }else{ 002569 while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){ 002570 pSib = pSib->pNext; 002571 } 002572 p->pNext = pSib->pNext; 002573 p->pPrev = pSib; 002574 if( p->pNext ){ 002575 p->pNext->pPrev = p; 002576 } 002577 pSib->pNext = p; 002578 } 002579 break; 002580 } 002581 } 002582 } 002583 #endif 002584 *ppBtree = p; 002585 002586 btree_open_out: 002587 if( rc!=SQLITE_OK ){ 002588 if( pBt && pBt->pPager ){ 002589 sqlite3PagerClose(pBt->pPager, 0); 002590 } 002591 sqlite3_free(pBt); 002592 sqlite3_free(p); 002593 *ppBtree = 0; 002594 }else{ 002595 sqlite3_file *pFile; 002596 002597 /* If the B-Tree was successfully opened, set the pager-cache size to the 002598 ** default value. 
Except, when opening on an existing shared pager-cache, 002599 ** do not change the pager-cache size. 002600 */ 002601 if( sqlite3BtreeSchema(p, 0, 0)==0 ){ 002602 sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE); 002603 } 002604 002605 pFile = sqlite3PagerFile(pBt->pPager); 002606 if( pFile->pMethods ){ 002607 sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db); 002608 } 002609 } 002610 if( mutexOpen ){ 002611 assert( sqlite3_mutex_held(mutexOpen) ); 002612 sqlite3_mutex_leave(mutexOpen); 002613 } 002614 assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 ); 002615 return rc; 002616 } 002617 002618 /* 002619 ** Decrement the BtShared.nRef counter. When it reaches zero, 002620 ** remove the BtShared structure from the sharing list. Return 002621 ** true if the BtShared.nRef counter reaches zero and return 002622 ** false if it is still positive. 002623 */ 002624 static int removeFromSharingList(BtShared *pBt){ 002625 #ifndef SQLITE_OMIT_SHARED_CACHE 002626 MUTEX_LOGIC( sqlite3_mutex *pMaster; ) 002627 BtShared *pList; 002628 int removed = 0; 002629 002630 assert( sqlite3_mutex_notheld(pBt->mutex) ); 002631 MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); ) 002632 sqlite3_mutex_enter(pMaster); 002633 pBt->nRef--; 002634 if( pBt->nRef<=0 ){ 002635 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ 002636 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; 002637 }else{ 002638 pList = GLOBAL(BtShared*,sqlite3SharedCacheList); 002639 while( ALWAYS(pList) && pList->pNext!=pBt ){ 002640 pList=pList->pNext; 002641 } 002642 if( ALWAYS(pList) ){ 002643 pList->pNext = pBt->pNext; 002644 } 002645 } 002646 if( SQLITE_THREADSAFE ){ 002647 sqlite3_mutex_free(pBt->mutex); 002648 } 002649 removed = 1; 002650 } 002651 sqlite3_mutex_leave(pMaster); 002652 return removed; 002653 #else 002654 return 1; 002655 #endif 002656 } 002657 002658 /* 002659 ** Make sure pBt->pTmpSpace points to an allocation of 002660 ** 
MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child 002661 ** pointer. 002662 */ 002663 static void allocateTempSpace(BtShared *pBt){ 002664 if( !pBt->pTmpSpace ){ 002665 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); 002666 002667 /* One of the uses of pBt->pTmpSpace is to format cells before 002668 ** inserting them into a leaf page (function fillInCell()). If 002669 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes 002670 ** by the various routines that manipulate binary cells. Which 002671 ** can mean that fillInCell() only initializes the first 2 or 3 002672 ** bytes of pTmpSpace, but that the first 4 bytes are copied from 002673 ** it into a database page. This is not actually a problem, but it 002674 ** does cause a valgrind error when the 1 or 2 bytes of unitialized 002675 ** data is passed to system call write(). So to avoid this error, 002676 ** zero the first 4 bytes of temp space here. 002677 ** 002678 ** Also: Provide four bytes of initialized space before the 002679 ** beginning of pTmpSpace as an area available to prepend the 002680 ** left-child pointer to the beginning of a cell. 002681 */ 002682 if( pBt->pTmpSpace ){ 002683 memset(pBt->pTmpSpace, 0, 8); 002684 pBt->pTmpSpace += 4; 002685 } 002686 } 002687 } 002688 002689 /* 002690 ** Free the pBt->pTmpSpace allocation 002691 */ 002692 static void freeTempSpace(BtShared *pBt){ 002693 if( pBt->pTmpSpace ){ 002694 pBt->pTmpSpace -= 4; 002695 sqlite3PageFree(pBt->pTmpSpace); 002696 pBt->pTmpSpace = 0; 002697 } 002698 } 002699 002700 /* 002701 ** Close an open database and invalidate all cursors. 002702 */ 002703 int sqlite3BtreeClose(Btree *p){ 002704 BtShared *pBt = p->pBt; 002705 BtCursor *pCur; 002706 002707 /* Close all cursors opened via this handle. 
*/ 002708 assert( sqlite3_mutex_held(p->db->mutex) ); 002709 sqlite3BtreeEnter(p); 002710 pCur = pBt->pCursor; 002711 while( pCur ){ 002712 BtCursor *pTmp = pCur; 002713 pCur = pCur->pNext; 002714 if( pTmp->pBtree==p ){ 002715 sqlite3BtreeCloseCursor(pTmp); 002716 } 002717 } 002718 002719 /* Rollback any active transaction and free the handle structure. 002720 ** The call to sqlite3BtreeRollback() drops any table-locks held by 002721 ** this handle. 002722 */ 002723 sqlite3BtreeRollback(p, SQLITE_OK, 0); 002724 sqlite3BtreeLeave(p); 002725 002726 /* If there are still other outstanding references to the shared-btree 002727 ** structure, return now. The remainder of this procedure cleans 002728 ** up the shared-btree. 002729 */ 002730 assert( p->wantToLock==0 && p->locked==0 ); 002731 if( !p->sharable || removeFromSharingList(pBt) ){ 002732 /* The pBt is no longer on the sharing list, so we can access 002733 ** it without having to hold the mutex. 002734 ** 002735 ** Clean out and delete the BtShared object. 002736 */ 002737 assert( !pBt->pCursor ); 002738 sqlite3PagerClose(pBt->pPager, p->db); 002739 if( pBt->xFreeSchema && pBt->pSchema ){ 002740 pBt->xFreeSchema(pBt->pSchema); 002741 } 002742 sqlite3DbFree(0, pBt->pSchema); 002743 freeTempSpace(pBt); 002744 sqlite3_free(pBt); 002745 } 002746 002747 #ifndef SQLITE_OMIT_SHARED_CACHE 002748 assert( p->wantToLock==0 ); 002749 assert( p->locked==0 ); 002750 if( p->pPrev ) p->pPrev->pNext = p->pNext; 002751 if( p->pNext ) p->pNext->pPrev = p->pPrev; 002752 #endif 002753 002754 sqlite3_free(p); 002755 return SQLITE_OK; 002756 } 002757 002758 /* 002759 ** Change the "soft" limit on the number of pages in the cache. 002760 ** Unused and unmodified pages will be recycled when the number of 002761 ** pages in the cache exceeds this soft limit. But the size of the 002762 ** cache is allowed to grow larger than this limit if it contains 002763 ** dirty pages or pages still in active use. 
002764 */ 002765 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 002766 BtShared *pBt = p->pBt; 002767 assert( sqlite3_mutex_held(p->db->mutex) ); 002768 sqlite3BtreeEnter(p); 002769 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 002770 sqlite3BtreeLeave(p); 002771 return SQLITE_OK; 002772 } 002773 002774 /* 002775 ** Change the "spill" limit on the number of pages in the cache. 002776 ** If the number of pages exceeds this limit during a write transaction, 002777 ** the pager might attempt to "spill" pages to the journal early in 002778 ** order to free up memory. 002779 ** 002780 ** The value returned is the current spill size. If zero is passed 002781 ** as an argument, no changes are made to the spill size setting, so 002782 ** using mxPage of 0 is a way to query the current spill size. 002783 */ 002784 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){ 002785 BtShared *pBt = p->pBt; 002786 int res; 002787 assert( sqlite3_mutex_held(p->db->mutex) ); 002788 sqlite3BtreeEnter(p); 002789 res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage); 002790 sqlite3BtreeLeave(p); 002791 return res; 002792 } 002793 002794 #if SQLITE_MAX_MMAP_SIZE>0 002795 /* 002796 ** Change the limit on the amount of the database file that may be 002797 ** memory mapped. 002798 */ 002799 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){ 002800 BtShared *pBt = p->pBt; 002801 assert( sqlite3_mutex_held(p->db->mutex) ); 002802 sqlite3BtreeEnter(p); 002803 sqlite3PagerSetMmapLimit(pBt->pPager, szMmap); 002804 sqlite3BtreeLeave(p); 002805 return SQLITE_OK; 002806 } 002807 #endif /* SQLITE_MAX_MMAP_SIZE>0 */ 002808 002809 /* 002810 ** Change the way data is synced to disk in order to increase or decrease 002811 ** how well the database resists damage due to OS crashes and power 002812 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 002813 ** there is a high probability of damage) Level 2 is the default. 
There 002814 ** is a very low but non-zero probability of damage. Level 3 reduces the 002815 ** probability of damage to near zero but with a write performance reduction. 002816 */ 002817 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 002818 int sqlite3BtreeSetPagerFlags( 002819 Btree *p, /* The btree to set the safety level on */ 002820 unsigned pgFlags /* Various PAGER_* flags */ 002821 ){ 002822 BtShared *pBt = p->pBt; 002823 assert( sqlite3_mutex_held(p->db->mutex) ); 002824 sqlite3BtreeEnter(p); 002825 sqlite3PagerSetFlags(pBt->pPager, pgFlags); 002826 sqlite3BtreeLeave(p); 002827 return SQLITE_OK; 002828 } 002829 #endif 002830 002831 /* 002832 ** Change the default pages size and the number of reserved bytes per page. 002833 ** Or, if the page size has already been fixed, return SQLITE_READONLY 002834 ** without changing anything. 002835 ** 002836 ** The page size must be a power of 2 between 512 and 65536. If the page 002837 ** size supplied does not meet this constraint then the page size is not 002838 ** changed. 002839 ** 002840 ** Page sizes are constrained to be a power of two so that the region 002841 ** of the database file used for locking (beginning at PENDING_BYTE, 002842 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 002843 ** at the beginning of a page. 002844 ** 002845 ** If parameter nReserve is less than zero, then the number of reserved 002846 ** bytes per page is left unchanged. 002847 ** 002848 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size 002849 ** and autovacuum mode can no longer be changed. 
002850 */ 002851 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ 002852 int rc = SQLITE_OK; 002853 BtShared *pBt = p->pBt; 002854 assert( nReserve>=-1 && nReserve<=255 ); 002855 sqlite3BtreeEnter(p); 002856 #if SQLITE_HAS_CODEC 002857 if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve; 002858 #endif 002859 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){ 002860 sqlite3BtreeLeave(p); 002861 return SQLITE_READONLY; 002862 } 002863 if( nReserve<0 ){ 002864 nReserve = pBt->pageSize - pBt->usableSize; 002865 } 002866 assert( nReserve>=0 && nReserve<=255 ); 002867 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 002868 ((pageSize-1)&pageSize)==0 ){ 002869 assert( (pageSize & 7)==0 ); 002870 assert( !pBt->pCursor ); 002871 pBt->pageSize = (u32)pageSize; 002872 freeTempSpace(pBt); 002873 } 002874 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 002875 pBt->usableSize = pBt->pageSize - (u16)nReserve; 002876 if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED; 002877 sqlite3BtreeLeave(p); 002878 return rc; 002879 } 002880 002881 /* 002882 ** Return the currently defined page size 002883 */ 002884 int sqlite3BtreeGetPageSize(Btree *p){ 002885 return p->pBt->pageSize; 002886 } 002887 002888 /* 002889 ** This function is similar to sqlite3BtreeGetReserve(), except that it 002890 ** may only be called if it is guaranteed that the b-tree mutex is already 002891 ** held. 002892 ** 002893 ** This is useful in one special case in the backup API code where it is 002894 ** known that the shared b-tree mutex is held, but the mutex on the 002895 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter() 002896 ** were to be called, it might collide with some other operation on the 002897 ** database handle that owns *p, causing undefined behavior. 
002898 */ 002899 int sqlite3BtreeGetReserveNoMutex(Btree *p){ 002900 int n; 002901 assert( sqlite3_mutex_held(p->pBt->mutex) ); 002902 n = p->pBt->pageSize - p->pBt->usableSize; 002903 return n; 002904 } 002905 002906 /* 002907 ** Return the number of bytes of space at the end of every page that 002908 ** are intentually left unused. This is the "reserved" space that is 002909 ** sometimes used by extensions. 002910 ** 002911 ** If SQLITE_HAS_MUTEX is defined then the number returned is the 002912 ** greater of the current reserved space and the maximum requested 002913 ** reserve space. 002914 */ 002915 int sqlite3BtreeGetOptimalReserve(Btree *p){ 002916 int n; 002917 sqlite3BtreeEnter(p); 002918 n = sqlite3BtreeGetReserveNoMutex(p); 002919 #ifdef SQLITE_HAS_CODEC 002920 if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve; 002921 #endif 002922 sqlite3BtreeLeave(p); 002923 return n; 002924 } 002925 002926 002927 /* 002928 ** Set the maximum page count for a database if mxPage is positive. 002929 ** No changes are made if mxPage is 0 or negative. 002930 ** Regardless of the value of mxPage, return the maximum page count. 002931 */ 002932 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ 002933 int n; 002934 sqlite3BtreeEnter(p); 002935 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 002936 sqlite3BtreeLeave(p); 002937 return n; 002938 } 002939 002940 /* 002941 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags: 002942 ** 002943 ** newFlag==0 Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared 002944 ** newFlag==1 BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared 002945 ** newFlag==2 BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set 002946 ** newFlag==(-1) No changes 002947 ** 002948 ** This routine acts as a query if newFlag is less than zero 002949 ** 002950 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but 002951 ** freelist leaf pages are not written back to the database. 
Thus in-page 002952 ** deleted content is cleared, but freelist deleted content is not. 002953 ** 002954 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition 002955 ** that freelist leaf pages are written back into the database, increasing 002956 ** the amount of disk I/O. 002957 */ 002958 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){ 002959 int b; 002960 if( p==0 ) return 0; 002961 sqlite3BtreeEnter(p); 002962 assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 ); 002963 assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) ); 002964 if( newFlag>=0 ){ 002965 p->pBt->btsFlags &= ~BTS_FAST_SECURE; 002966 p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag; 002967 } 002968 b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE; 002969 sqlite3BtreeLeave(p); 002970 return b; 002971 } 002972 002973 /* 002974 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 002975 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 002976 ** is disabled. The default value for the auto-vacuum property is 002977 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 002978 */ 002979 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 002980 #ifdef SQLITE_OMIT_AUTOVACUUM 002981 return SQLITE_READONLY; 002982 #else 002983 BtShared *pBt = p->pBt; 002984 int rc = SQLITE_OK; 002985 u8 av = (u8)autoVacuum; 002986 002987 sqlite3BtreeEnter(p); 002988 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){ 002989 rc = SQLITE_READONLY; 002990 }else{ 002991 pBt->autoVacuum = av ?1:0; 002992 pBt->incrVacuum = av==2 ?1:0; 002993 } 002994 sqlite3BtreeLeave(p); 002995 return rc; 002996 #endif 002997 } 002998 002999 /* 003000 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 003001 ** enabled 1 is returned. Otherwise 0. 
003002 */ 003003 int sqlite3BtreeGetAutoVacuum(Btree *p){ 003004 #ifdef SQLITE_OMIT_AUTOVACUUM 003005 return BTREE_AUTOVACUUM_NONE; 003006 #else 003007 int rc; 003008 sqlite3BtreeEnter(p); 003009 rc = ( 003010 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 003011 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 003012 BTREE_AUTOVACUUM_INCR 003013 ); 003014 sqlite3BtreeLeave(p); 003015 return rc; 003016 #endif 003017 } 003018 003019 /* 003020 ** If the user has not set the safety-level for this database connection 003021 ** using "PRAGMA synchronous", and if the safety-level is not already 003022 ** set to the value passed to this function as the second parameter, 003023 ** set it so. 003024 */ 003025 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \ 003026 && !defined(SQLITE_OMIT_WAL) 003027 static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){ 003028 sqlite3 *db; 003029 Db *pDb; 003030 if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){ 003031 while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; } 003032 if( pDb->bSyncSet==0 003033 && pDb->safety_level!=safety_level 003034 && pDb!=&db->aDb[1] 003035 ){ 003036 pDb->safety_level = safety_level; 003037 sqlite3PagerSetFlags(pBt->pPager, 003038 pDb->safety_level | (db->flags & PAGER_FLAGS_MASK)); 003039 } 003040 } 003041 } 003042 #else 003043 # define setDefaultSyncFlag(pBt,safety_level) 003044 #endif 003045 003046 /* Forward declaration */ 003047 static int newDatabase(BtShared*); 003048 003049 003050 /* 003051 ** Get a reference to pPage1 of the database file. This will 003052 ** also acquire a readlock on that file. 003053 ** 003054 ** SQLITE_OK is returned on success. If the file is not a 003055 ** well-formed database file, then SQLITE_CORRUPT is returned. 003056 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 003057 ** is returned if we run out of memory. 
003058 */ 003059 static int lockBtree(BtShared *pBt){ 003060 int rc; /* Result code from subfunctions */ 003061 MemPage *pPage1; /* Page 1 of the database file */ 003062 u32 nPage; /* Number of pages in the database */ 003063 u32 nPageFile = 0; /* Number of pages in the database file */ 003064 u32 nPageHeader; /* Number of pages in the database according to hdr */ 003065 003066 assert( sqlite3_mutex_held(pBt->mutex) ); 003067 assert( pBt->pPage1==0 ); 003068 rc = sqlite3PagerSharedLock(pBt->pPager); 003069 if( rc!=SQLITE_OK ) return rc; 003070 rc = btreeGetPage(pBt, 1, &pPage1, 0); 003071 if( rc!=SQLITE_OK ) return rc; 003072 003073 /* Do some checking to help insure the file we opened really is 003074 ** a valid database file. 003075 */ 003076 nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData); 003077 sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile); 003078 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){ 003079 nPage = nPageFile; 003080 } 003081 if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){ 003082 nPage = 0; 003083 } 003084 if( nPage>0 ){ 003085 u32 pageSize; 003086 u32 usableSize; 003087 u8 *page1 = pPage1->aData; 003088 rc = SQLITE_NOTADB; 003089 /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins 003090 ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d 003091 ** 61 74 20 33 00. */ 003092 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 003093 goto page1_init_failed; 003094 } 003095 003096 #ifdef SQLITE_OMIT_WAL 003097 if( page1[18]>1 ){ 003098 pBt->btsFlags |= BTS_READ_ONLY; 003099 } 003100 if( page1[19]>1 ){ 003101 goto page1_init_failed; 003102 } 003103 #else 003104 if( page1[18]>2 ){ 003105 pBt->btsFlags |= BTS_READ_ONLY; 003106 } 003107 if( page1[19]>2 ){ 003108 goto page1_init_failed; 003109 } 003110 003111 /* If the write version is set to 2, this database should be accessed 003112 ** in WAL mode. If the log is not already open, open it now. 
Then 003113 ** return SQLITE_OK and return without populating BtShared.pPage1. 003114 ** The caller detects this and calls this function again. This is 003115 ** required as the version of page 1 currently in the page1 buffer 003116 ** may not be the latest version - there may be a newer one in the log 003117 ** file. 003118 */ 003119 if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){ 003120 int isOpen = 0; 003121 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen); 003122 if( rc!=SQLITE_OK ){ 003123 goto page1_init_failed; 003124 }else{ 003125 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1); 003126 if( isOpen==0 ){ 003127 releasePageOne(pPage1); 003128 return SQLITE_OK; 003129 } 003130 } 003131 rc = SQLITE_NOTADB; 003132 }else{ 003133 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1); 003134 } 003135 #endif 003136 003137 /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload 003138 ** fractions and the leaf payload fraction values must be 64, 32, and 32. 003139 ** 003140 ** The original design allowed these amounts to vary, but as of 003141 ** version 3.6.0, we require them to be fixed. 003142 */ 003143 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ 003144 goto page1_init_failed; 003145 } 003146 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is 003147 ** determined by the 2-byte integer located at an offset of 16 bytes from 003148 ** the beginning of the database file. */ 003149 pageSize = (page1[16]<<8) | (page1[17]<<16); 003150 /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two 003151 ** between 512 and 65536 inclusive. 
*/ 003152 if( ((pageSize-1)&pageSize)!=0 003153 || pageSize>SQLITE_MAX_PAGE_SIZE 003154 || pageSize<=256 003155 ){ 003156 goto page1_init_failed; 003157 } 003158 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003159 assert( (pageSize & 7)==0 ); 003160 /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte 003161 ** integer at offset 20 is the number of bytes of space at the end of 003162 ** each page to reserve for extensions. 003163 ** 003164 ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is 003165 ** determined by the one-byte unsigned integer found at an offset of 20 003166 ** into the database file header. */ 003167 usableSize = pageSize - page1[20]; 003168 if( (u32)pageSize!=pBt->pageSize ){ 003169 /* After reading the first page of the database assuming a page size 003170 ** of BtShared.pageSize, we have discovered that the page-size is 003171 ** actually pageSize. Unlock the database, leave pBt->pPage1 at 003172 ** zero and return SQLITE_OK. The caller will call this function 003173 ** again with the correct page-size. 003174 */ 003175 releasePageOne(pPage1); 003176 pBt->usableSize = usableSize; 003177 pBt->pageSize = pageSize; 003178 freeTempSpace(pBt); 003179 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, 003180 pageSize-usableSize); 003181 return rc; 003182 } 003183 if( sqlite3WritableSchema(pBt->db)==0 && nPage>nPageFile ){ 003184 rc = SQLITE_CORRUPT_BKPT; 003185 goto page1_init_failed; 003186 } 003187 /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to 003188 ** be less than 480. In other words, if the page size is 512, then the 003189 ** reserved space size cannot exceed 32. 
*/ 003190 if( usableSize<480 ){ 003191 goto page1_init_failed; 003192 } 003193 pBt->pageSize = pageSize; 003194 pBt->usableSize = usableSize; 003195 #ifndef SQLITE_OMIT_AUTOVACUUM 003196 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 003197 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 003198 #endif 003199 } 003200 003201 /* maxLocal is the maximum amount of payload to store locally for 003202 ** a cell. Make sure it is small enough so that at least minFanout 003203 ** cells can will fit on one page. We assume a 10-byte page header. 003204 ** Besides the payload, the cell must store: 003205 ** 2-byte pointer to the cell 003206 ** 4-byte child pointer 003207 ** 9-byte nKey value 003208 ** 4-byte nData value 003209 ** 4-byte overflow page pointer 003210 ** So a cell consists of a 2-byte pointer, a header which is as much as 003211 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 003212 ** page pointer. 003213 */ 003214 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23); 003215 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23); 003216 pBt->maxLeaf = (u16)(pBt->usableSize - 35); 003217 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23); 003218 if( pBt->maxLocal>127 ){ 003219 pBt->max1bytePayload = 127; 003220 }else{ 003221 pBt->max1bytePayload = (u8)pBt->maxLocal; 003222 } 003223 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 003224 pBt->pPage1 = pPage1; 003225 pBt->nPage = nPage; 003226 return SQLITE_OK; 003227 003228 page1_init_failed: 003229 releasePageOne(pPage1); 003230 pBt->pPage1 = 0; 003231 return rc; 003232 } 003233 003234 #ifndef NDEBUG 003235 /* 003236 ** Return the number of cursors open on pBt. This is for use 003237 ** in assert() expressions, so it is only compiled if NDEBUG is not 003238 ** defined. 003239 ** 003240 ** Only write cursors are counted if wrOnly is true. If wrOnly is 003241 ** false then all cursors are counted. 
003242 ** 003243 ** For the purposes of this routine, a cursor is any cursor that 003244 ** is capable of reading or writing to the database. Cursors that 003245 ** have been tripped into the CURSOR_FAULT state are not counted. 003246 */ 003247 static int countValidCursors(BtShared *pBt, int wrOnly){ 003248 BtCursor *pCur; 003249 int r = 0; 003250 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 003251 if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0) 003252 && pCur->eState!=CURSOR_FAULT ) r++; 003253 } 003254 return r; 003255 } 003256 #endif 003257 003258 /* 003259 ** If there are no outstanding cursors and we are not in the middle 003260 ** of a transaction but there is a read lock on the database, then 003261 ** this routine unrefs the first page of the database file which 003262 ** has the effect of releasing the read lock. 003263 ** 003264 ** If there is a transaction in progress, this routine is a no-op. 003265 */ 003266 static void unlockBtreeIfUnused(BtShared *pBt){ 003267 assert( sqlite3_mutex_held(pBt->mutex) ); 003268 assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE ); 003269 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ 003270 MemPage *pPage1 = pBt->pPage1; 003271 assert( pPage1->aData ); 003272 assert( sqlite3PagerRefcount(pBt->pPager)==1 ); 003273 pBt->pPage1 = 0; 003274 releasePageOne(pPage1); 003275 } 003276 } 003277 003278 /* 003279 ** If pBt points to an empty file then convert that empty file 003280 ** into a new empty database by initializing the first page of 003281 ** the database. 
003282 */ 003283 static int newDatabase(BtShared *pBt){ 003284 MemPage *pP1; 003285 unsigned char *data; 003286 int rc; 003287 003288 assert( sqlite3_mutex_held(pBt->mutex) ); 003289 if( pBt->nPage>0 ){ 003290 return SQLITE_OK; 003291 } 003292 pP1 = pBt->pPage1; 003293 assert( pP1!=0 ); 003294 data = pP1->aData; 003295 rc = sqlite3PagerWrite(pP1->pDbPage); 003296 if( rc ) return rc; 003297 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 003298 assert( sizeof(zMagicHeader)==16 ); 003299 data[16] = (u8)((pBt->pageSize>>8)&0xff); 003300 data[17] = (u8)((pBt->pageSize>>16)&0xff); 003301 data[18] = 1; 003302 data[19] = 1; 003303 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); 003304 data[20] = (u8)(pBt->pageSize - pBt->usableSize); 003305 data[21] = 64; 003306 data[22] = 32; 003307 data[23] = 32; 003308 memset(&data[24], 0, 100-24); 003309 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 003310 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003311 #ifndef SQLITE_OMIT_AUTOVACUUM 003312 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 003313 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 003314 put4byte(&data[36 + 4*4], pBt->autoVacuum); 003315 put4byte(&data[36 + 7*4], pBt->incrVacuum); 003316 #endif 003317 pBt->nPage = 1; 003318 data[31] = 1; 003319 return SQLITE_OK; 003320 } 003321 003322 /* 003323 ** Initialize the first page of the database file (creating a database 003324 ** consisting of a single page and no schema objects). Return SQLITE_OK 003325 ** if successful, or an SQLite error code otherwise. 003326 */ 003327 int sqlite3BtreeNewDb(Btree *p){ 003328 int rc; 003329 sqlite3BtreeEnter(p); 003330 p->pBt->nPage = 0; 003331 rc = newDatabase(p->pBt); 003332 sqlite3BtreeLeave(p); 003333 return rc; 003334 } 003335 003336 /* 003337 ** Attempt to start a new transaction. A write-transaction 003338 ** is started if the second argument is nonzero, otherwise a read- 003339 ** transaction. 
If the second argument is 2 or more an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database.  None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
003370 */ 003371 int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ 003372 BtShared *pBt = p->pBt; 003373 int rc = SQLITE_OK; 003374 003375 sqlite3BtreeEnter(p); 003376 btreeIntegrity(p); 003377 003378 /* If the btree is already in a write-transaction, or it 003379 ** is already in a read-transaction and a read-transaction 003380 ** is requested, this is a no-op. 003381 */ 003382 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 003383 goto trans_begun; 003384 } 003385 assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 ); 003386 003387 if( (p->db->flags & SQLITE_ResetDatabase) 003388 && sqlite3PagerIsreadonly(pBt->pPager)==0 003389 ){ 003390 pBt->btsFlags &= ~BTS_READ_ONLY; 003391 } 003392 003393 /* Write transactions are not possible on a read-only database */ 003394 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){ 003395 rc = SQLITE_READONLY; 003396 goto trans_begun; 003397 } 003398 003399 #ifndef SQLITE_OMIT_SHARED_CACHE 003400 { 003401 sqlite3 *pBlock = 0; 003402 /* If another database handle has already opened a write transaction 003403 ** on this shared-btree structure and a second write transaction is 003404 ** requested, return SQLITE_LOCKED. 003405 */ 003406 if( (wrflag && pBt->inTransaction==TRANS_WRITE) 003407 || (pBt->btsFlags & BTS_PENDING)!=0 003408 ){ 003409 pBlock = pBt->pWriter->db; 003410 }else if( wrflag>1 ){ 003411 BtLock *pIter; 003412 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 003413 if( pIter->pBtree!=p ){ 003414 pBlock = pIter->pBtree->db; 003415 break; 003416 } 003417 } 003418 } 003419 if( pBlock ){ 003420 sqlite3ConnectionBlocked(p->db, pBlock); 003421 rc = SQLITE_LOCKED_SHAREDCACHE; 003422 goto trans_begun; 003423 } 003424 } 003425 #endif 003426 003427 /* Any read-only or read-write transaction implies a read-lock on 003428 ** page 1. So if some other shared-cache client already has a write-lock 003429 ** on page 1, the transaction cannot be opened. 
*/ 003430 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); 003431 if( SQLITE_OK!=rc ) goto trans_begun; 003432 003433 pBt->btsFlags &= ~BTS_INITIALLY_EMPTY; 003434 if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY; 003435 do { 003436 /* Call lockBtree() until either pBt->pPage1 is populated or 003437 ** lockBtree() returns something other than SQLITE_OK. lockBtree() 003438 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after 003439 ** reading page 1 it discovers that the page-size of the database 003440 ** file is not pBt->pageSize. In this case lockBtree() will update 003441 ** pBt->pageSize to the page-size of the file on disk. 003442 */ 003443 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); 003444 003445 if( rc==SQLITE_OK && wrflag ){ 003446 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){ 003447 rc = SQLITE_READONLY; 003448 }else{ 003449 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db)); 003450 if( rc==SQLITE_OK ){ 003451 rc = newDatabase(pBt); 003452 }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){ 003453 /* if there was no transaction opened when this function was 003454 ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error 003455 ** code to SQLITE_BUSY. 
*/ 003456 rc = SQLITE_BUSY; 003457 } 003458 } 003459 } 003460 003461 if( rc!=SQLITE_OK ){ 003462 unlockBtreeIfUnused(pBt); 003463 } 003464 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 003465 btreeInvokeBusyHandler(pBt) ); 003466 sqlite3PagerResetLockTimeout(pBt->pPager); 003467 003468 if( rc==SQLITE_OK ){ 003469 if( p->inTrans==TRANS_NONE ){ 003470 pBt->nTransaction++; 003471 #ifndef SQLITE_OMIT_SHARED_CACHE 003472 if( p->sharable ){ 003473 assert( p->lock.pBtree==p && p->lock.iTable==1 ); 003474 p->lock.eLock = READ_LOCK; 003475 p->lock.pNext = pBt->pLock; 003476 pBt->pLock = &p->lock; 003477 } 003478 #endif 003479 } 003480 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 003481 if( p->inTrans>pBt->inTransaction ){ 003482 pBt->inTransaction = p->inTrans; 003483 } 003484 if( wrflag ){ 003485 MemPage *pPage1 = pBt->pPage1; 003486 #ifndef SQLITE_OMIT_SHARED_CACHE 003487 assert( !pBt->pWriter ); 003488 pBt->pWriter = p; 003489 pBt->btsFlags &= ~BTS_EXCLUSIVE; 003490 if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE; 003491 #endif 003492 003493 /* If the db-size header field is incorrect (as it may be if an old 003494 ** client has been writing the database file), update it now. Doing 003495 ** this sooner rather than later means the database size can safely 003496 ** re-read the database size from page 1 if a savepoint or transaction 003497 ** rollback occurs within the transaction. 003498 */ 003499 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){ 003500 rc = sqlite3PagerWrite(pPage1->pDbPage); 003501 if( rc==SQLITE_OK ){ 003502 put4byte(&pPage1->aData[28], pBt->nPage); 003503 } 003504 } 003505 } 003506 } 003507 003508 trans_begun: 003509 if( rc==SQLITE_OK ){ 003510 if( pSchemaVersion ){ 003511 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]); 003512 } 003513 if( wrflag ){ 003514 /* This call makes sure that the pager has the correct number of 003515 ** open savepoints. 
If the second parameter is greater than 0 and 003516 ** the sub-journal is not already open, then it will be opened here. 003517 */ 003518 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); 003519 } 003520 } 003521 003522 btreeIntegrity(p); 003523 sqlite3BtreeLeave(p); 003524 return rc; 003525 } 003526 003527 #ifndef SQLITE_OMIT_AUTOVACUUM 003528 003529 /* 003530 ** Set the pointer-map entries for all children of page pPage. Also, if 003531 ** pPage contains cells that point to overflow pages, set the pointer 003532 ** map entries for the overflow pages as well. 003533 */ 003534 static int setChildPtrmaps(MemPage *pPage){ 003535 int i; /* Counter variable */ 003536 int nCell; /* Number of cells in page pPage */ 003537 int rc; /* Return code */ 003538 BtShared *pBt = pPage->pBt; 003539 Pgno pgno = pPage->pgno; 003540 003541 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 003542 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage); 003543 if( rc!=SQLITE_OK ) return rc; 003544 nCell = pPage->nCell; 003545 003546 for(i=0; i<nCell; i++){ 003547 u8 *pCell = findCell(pPage, i); 003548 003549 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc); 003550 003551 if( !pPage->leaf ){ 003552 Pgno childPgno = get4byte(pCell); 003553 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 003554 } 003555 } 003556 003557 if( !pPage->leaf ){ 003558 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 003559 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 003560 } 003561 003562 return rc; 003563 } 003564 003565 /* 003566 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so 003567 ** that it points to iTo. Parameter eType describes the type of pointer to 003568 ** be modified, as follows: 003569 ** 003570 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 003571 ** page of pPage. 003572 ** 003573 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 003574 ** page pointed to by one of the cells on pPage. 
003575 ** 003576 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 003577 ** overflow page in the list. 003578 */ 003579 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 003580 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 003581 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 003582 if( eType==PTRMAP_OVERFLOW2 ){ 003583 /* The pointer is always the first 4 bytes of the page in this case. */ 003584 if( get4byte(pPage->aData)!=iFrom ){ 003585 return SQLITE_CORRUPT_PAGE(pPage); 003586 } 003587 put4byte(pPage->aData, iTo); 003588 }else{ 003589 int i; 003590 int nCell; 003591 int rc; 003592 003593 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage); 003594 if( rc ) return rc; 003595 nCell = pPage->nCell; 003596 003597 for(i=0; i<nCell; i++){ 003598 u8 *pCell = findCell(pPage, i); 003599 if( eType==PTRMAP_OVERFLOW1 ){ 003600 CellInfo info; 003601 pPage->xParseCell(pPage, pCell, &info); 003602 if( info.nLocal<info.nPayload ){ 003603 if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){ 003604 return SQLITE_CORRUPT_PAGE(pPage); 003605 } 003606 if( iFrom==get4byte(pCell+info.nSize-4) ){ 003607 put4byte(pCell+info.nSize-4, iTo); 003608 break; 003609 } 003610 } 003611 }else{ 003612 if( get4byte(pCell)==iFrom ){ 003613 put4byte(pCell, iTo); 003614 break; 003615 } 003616 } 003617 } 003618 003619 if( i==nCell ){ 003620 if( eType!=PTRMAP_BTREE || 003621 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 003622 return SQLITE_CORRUPT_PAGE(pPage); 003623 } 003624 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 003625 } 003626 } 003627 return SQLITE_OK; 003628 } 003629 003630 003631 /* 003632 ** Move the open database page pDbPage to location iFreePage in the 003633 ** database. The pDbPage reference remains valid. 003634 ** 003635 ** The isCommit flag indicates that there is no need to remember that 003636 ** the journal needs to be sync()ed before database page pDbPage->pgno 003637 ** can be written to. 
The caller has already promised not to write to that 003638 ** page. 003639 */ 003640 static int relocatePage( 003641 BtShared *pBt, /* Btree */ 003642 MemPage *pDbPage, /* Open page to move */ 003643 u8 eType, /* Pointer map 'type' entry for pDbPage */ 003644 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 003645 Pgno iFreePage, /* The location to move pDbPage to */ 003646 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */ 003647 ){ 003648 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 003649 Pgno iDbPage = pDbPage->pgno; 003650 Pager *pPager = pBt->pPager; 003651 int rc; 003652 003653 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 003654 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 003655 assert( sqlite3_mutex_held(pBt->mutex) ); 003656 assert( pDbPage->pBt==pBt ); 003657 if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT; 003658 003659 /* Move page iDbPage from its current location to page number iFreePage */ 003660 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 003661 iDbPage, iFreePage, iPtrPage, eType)); 003662 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); 003663 if( rc!=SQLITE_OK ){ 003664 return rc; 003665 } 003666 pDbPage->pgno = iFreePage; 003667 003668 /* If pDbPage was a btree-page, then it may have child pages and/or cells 003669 ** that point to overflow pages. The pointer map entries for all these 003670 ** pages need to be changed. 003671 ** 003672 ** If pDbPage is an overflow page, then the first 4 bytes may store a 003673 ** pointer to a subsequent overflow page. If this is the case, then 003674 ** the pointer map needs to be updated for the subsequent overflow page. 
003675 */ 003676 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 003677 rc = setChildPtrmaps(pDbPage); 003678 if( rc!=SQLITE_OK ){ 003679 return rc; 003680 } 003681 }else{ 003682 Pgno nextOvfl = get4byte(pDbPage->aData); 003683 if( nextOvfl!=0 ){ 003684 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc); 003685 if( rc!=SQLITE_OK ){ 003686 return rc; 003687 } 003688 } 003689 } 003690 003691 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 003692 ** that it points at iFreePage. Also fix the pointer map entry for 003693 ** iPtrPage. 003694 */ 003695 if( eType!=PTRMAP_ROOTPAGE ){ 003696 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 003697 if( rc!=SQLITE_OK ){ 003698 return rc; 003699 } 003700 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 003701 if( rc!=SQLITE_OK ){ 003702 releasePage(pPtrPage); 003703 return rc; 003704 } 003705 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 003706 releasePage(pPtrPage); 003707 if( rc==SQLITE_OK ){ 003708 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc); 003709 } 003710 } 003711 return rc; 003712 } 003713 003714 /* Forward declaration required by incrVacuumStep(). */ 003715 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 003716 003717 /* 003718 ** Perform a single step of an incremental-vacuum. If successful, return 003719 ** SQLITE_OK. If there is no work to do (and therefore no point in 003720 ** calling this function again), return SQLITE_DONE. Or, if an error 003721 ** occurs, return some other error code. 003722 ** 003723 ** More specifically, this function attempts to re-organize the database so 003724 ** that the last page of the file currently in use is no longer in use. 003725 ** 003726 ** Parameter nFin is the number of pages that this database would contain 003727 ** were this function called until it returns SQLITE_DONE. 
003728 ** 003729 ** If the bCommit parameter is non-zero, this function assumes that the 003730 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE 003731 ** or an error. bCommit is passed true for an auto-vacuum-on-commit 003732 ** operation, or false for an incremental vacuum. 003733 */ 003734 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ 003735 Pgno nFreeList; /* Number of pages still on the free-list */ 003736 int rc; 003737 003738 assert( sqlite3_mutex_held(pBt->mutex) ); 003739 assert( iLastPg>nFin ); 003740 003741 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 003742 u8 eType; 003743 Pgno iPtrPage; 003744 003745 nFreeList = get4byte(&pBt->pPage1->aData[36]); 003746 if( nFreeList==0 ){ 003747 return SQLITE_DONE; 003748 } 003749 003750 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 003751 if( rc!=SQLITE_OK ){ 003752 return rc; 003753 } 003754 if( eType==PTRMAP_ROOTPAGE ){ 003755 return SQLITE_CORRUPT_BKPT; 003756 } 003757 003758 if( eType==PTRMAP_FREEPAGE ){ 003759 if( bCommit==0 ){ 003760 /* Remove the page from the files free-list. This is not required 003761 ** if bCommit is non-zero. In that case, the free-list will be 003762 ** truncated to zero after this function returns, so it doesn't 003763 ** matter if it still contains some garbage entries. 
003764 */ 003765 Pgno iFreePg; 003766 MemPage *pFreePg; 003767 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT); 003768 if( rc!=SQLITE_OK ){ 003769 return rc; 003770 } 003771 assert( iFreePg==iLastPg ); 003772 releasePage(pFreePg); 003773 } 003774 } else { 003775 Pgno iFreePg; /* Index of free page to move pLastPg to */ 003776 MemPage *pLastPg; 003777 u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */ 003778 Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */ 003779 003780 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0); 003781 if( rc!=SQLITE_OK ){ 003782 return rc; 003783 } 003784 003785 /* If bCommit is zero, this loop runs exactly once and page pLastPg 003786 ** is swapped with the first free page pulled off the free list. 003787 ** 003788 ** On the other hand, if bCommit is greater than zero, then keep 003789 ** looping until a free-page located within the first nFin pages 003790 ** of the file is found. 003791 */ 003792 if( bCommit==0 ){ 003793 eMode = BTALLOC_LE; 003794 iNear = nFin; 003795 } 003796 do { 003797 MemPage *pFreePg; 003798 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode); 003799 if( rc!=SQLITE_OK ){ 003800 releasePage(pLastPg); 003801 return rc; 003802 } 003803 releasePage(pFreePg); 003804 }while( bCommit && iFreePg>nFin ); 003805 assert( iFreePg<iLastPg ); 003806 003807 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit); 003808 releasePage(pLastPg); 003809 if( rc!=SQLITE_OK ){ 003810 return rc; 003811 } 003812 } 003813 } 003814 003815 if( bCommit==0 ){ 003816 do { 003817 iLastPg--; 003818 }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) ); 003819 pBt->bDoTruncate = 1; 003820 pBt->nPage = iLastPg; 003821 } 003822 return SQLITE_OK; 003823 } 003824 003825 /* 003826 ** The database opened by the first argument is an auto-vacuum database 003827 ** nOrig pages in size containing nFree free pages. 
Return the expected 003828 ** size of the database in pages following an auto-vacuum operation. 003829 */ 003830 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){ 003831 int nEntry; /* Number of entries on one ptrmap page */ 003832 Pgno nPtrmap; /* Number of PtrMap pages to be freed */ 003833 Pgno nFin; /* Return value */ 003834 003835 nEntry = pBt->usableSize/5; 003836 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry; 003837 nFin = nOrig - nFree - nPtrmap; 003838 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ 003839 nFin--; 003840 } 003841 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 003842 nFin--; 003843 } 003844 003845 return nFin; 003846 } 003847 003848 /* 003849 ** A write-transaction must be opened before calling this function. 003850 ** It performs a single unit of work towards an incremental vacuum. 003851 ** 003852 ** If the incremental vacuum is finished after this function has run, 003853 ** SQLITE_DONE is returned. If it is not finished, but no error occurred, 003854 ** SQLITE_OK is returned. Otherwise an SQLite error code. 
003855 */ 003856 int sqlite3BtreeIncrVacuum(Btree *p){ 003857 int rc; 003858 BtShared *pBt = p->pBt; 003859 003860 sqlite3BtreeEnter(p); 003861 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 003862 if( !pBt->autoVacuum ){ 003863 rc = SQLITE_DONE; 003864 }else{ 003865 Pgno nOrig = btreePagecount(pBt); 003866 Pgno nFree = get4byte(&pBt->pPage1->aData[36]); 003867 Pgno nFin = finalDbSize(pBt, nOrig, nFree); 003868 003869 if( nOrig<nFin ){ 003870 rc = SQLITE_CORRUPT_BKPT; 003871 }else if( nFree>0 ){ 003872 rc = saveAllCursors(pBt, 0, 0); 003873 if( rc==SQLITE_OK ){ 003874 invalidateAllOverflowCache(pBt); 003875 rc = incrVacuumStep(pBt, nFin, nOrig, 0); 003876 } 003877 if( rc==SQLITE_OK ){ 003878 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 003879 put4byte(&pBt->pPage1->aData[28], pBt->nPage); 003880 } 003881 }else{ 003882 rc = SQLITE_DONE; 003883 } 003884 } 003885 sqlite3BtreeLeave(p); 003886 return rc; 003887 } 003888 003889 /* 003890 ** This routine is called prior to sqlite3PagerCommit when a transaction 003891 ** is committed for an auto-vacuum database. 003892 ** 003893 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages 003894 ** the database file should be truncated to during the commit process. 003895 ** i.e. the database has been reorganized so that only the first *pnTrunc 003896 ** pages are in use. 
003897 */ 003898 static int autoVacuumCommit(BtShared *pBt){ 003899 int rc = SQLITE_OK; 003900 Pager *pPager = pBt->pPager; 003901 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); ) 003902 003903 assert( sqlite3_mutex_held(pBt->mutex) ); 003904 invalidateAllOverflowCache(pBt); 003905 assert(pBt->autoVacuum); 003906 if( !pBt->incrVacuum ){ 003907 Pgno nFin; /* Number of pages in database after autovacuuming */ 003908 Pgno nFree; /* Number of pages on the freelist initially */ 003909 Pgno iFree; /* The next page to be freed */ 003910 Pgno nOrig; /* Database size before freeing */ 003911 003912 nOrig = btreePagecount(pBt); 003913 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ 003914 /* It is not possible to create a database for which the final page 003915 ** is either a pointer-map page or the pending-byte page. If one 003916 ** is encountered, this indicates corruption. 003917 */ 003918 return SQLITE_CORRUPT_BKPT; 003919 } 003920 003921 nFree = get4byte(&pBt->pPage1->aData[36]); 003922 nFin = finalDbSize(pBt, nOrig, nFree); 003923 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; 003924 if( nFin<nOrig ){ 003925 rc = saveAllCursors(pBt, 0, 0); 003926 } 003927 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ 003928 rc = incrVacuumStep(pBt, nFin, iFree, 1); 003929 } 003930 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ 003931 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 003932 put4byte(&pBt->pPage1->aData[32], 0); 003933 put4byte(&pBt->pPage1->aData[36], 0); 003934 put4byte(&pBt->pPage1->aData[28], nFin); 003935 pBt->bDoTruncate = 1; 003936 pBt->nPage = nFin; 003937 } 003938 if( rc!=SQLITE_OK ){ 003939 sqlite3PagerRollback(pPager); 003940 } 003941 } 003942 003943 assert( nRef>=sqlite3PagerRefcount(pPager) ); 003944 return rc; 003945 } 003946 003947 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */ 003948 # define setChildPtrmaps(x) SQLITE_OK 003949 #endif 003950 003951 /* 003952 ** This routine does the first phase of a two-phase commit. 
This routine 003953 ** causes a rollback journal to be created (if it does not already exist) 003954 ** and populated with enough information so that if a power loss occurs 003955 ** the database can be restored to its original state by playing back 003956 ** the journal. Then the contents of the journal are flushed out to 003957 ** the disk. After the journal is safely on oxide, the changes to the 003958 ** database are written into the database file and flushed to oxide. 003959 ** At the end of this call, the rollback journal still exists on the 003960 ** disk and we are still holding all locks, so the transaction has not 003961 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the 003962 ** commit process. 003963 ** 003964 ** This call is a no-op if no write-transaction is currently active on pBt. 003965 ** 003966 ** Otherwise, sync the database file for the btree pBt. zMaster points to 003967 ** the name of a master journal file that should be written into the 003968 ** individual journal file, or is NULL, indicating no master journal file 003969 ** (single database transaction). 003970 ** 003971 ** When this is called, the master journal should already have been 003972 ** created, populated with this journal pointer and synced to disk. 003973 ** 003974 ** Once this routine has returned, the only thing required to commit 003975 ** the write-transaction for this database file is to delete the journal.
003976 */ 003977 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ 003978 int rc = SQLITE_OK; 003979 if( p->inTrans==TRANS_WRITE ){ 003980 BtShared *pBt = p->pBt; 003981 sqlite3BtreeEnter(p); 003982 #ifndef SQLITE_OMIT_AUTOVACUUM 003983 if( pBt->autoVacuum ){ 003984 rc = autoVacuumCommit(pBt); 003985 if( rc!=SQLITE_OK ){ 003986 sqlite3BtreeLeave(p); 003987 return rc; 003988 } 003989 } 003990 if( pBt->bDoTruncate ){ 003991 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage); 003992 } 003993 #endif 003994 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0); 003995 sqlite3BtreeLeave(p); 003996 } 003997 return rc; 003998 } 003999 004000 /* 004001 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback() 004002 ** at the conclusion of a transaction. 004003 */ 004004 static void btreeEndTransaction(Btree *p){ 004005 BtShared *pBt = p->pBt; 004006 sqlite3 *db = p->db; 004007 assert( sqlite3BtreeHoldsMutex(p) ); 004008 004009 #ifndef SQLITE_OMIT_AUTOVACUUM 004010 pBt->bDoTruncate = 0; 004011 #endif 004012 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){ 004013 /* If there are other active statements that belong to this database 004014 ** handle, downgrade to a read-only transaction. The other statements 004015 ** may still be reading from the database. */ 004016 downgradeAllSharedCacheTableLocks(p); 004017 p->inTrans = TRANS_READ; 004018 }else{ 004019 /* If the handle had any kind of transaction open, decrement the 004020 ** transaction count of the shared btree. If the transaction count 004021 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() 004022 ** call below will unlock the pager. 
*/ 004023 if( p->inTrans!=TRANS_NONE ){ 004024 clearAllSharedCacheTableLocks(p); 004025 pBt->nTransaction--; 004026 if( 0==pBt->nTransaction ){ 004027 pBt->inTransaction = TRANS_NONE; 004028 } 004029 } 004030 004031 /* Set the current transaction state to TRANS_NONE and unlock the 004032 ** pager if this call closed the only read or write transaction. */ 004033 p->inTrans = TRANS_NONE; 004034 unlockBtreeIfUnused(pBt); 004035 } 004036 004037 btreeIntegrity(p); 004038 } 004039 004040 /* 004041 ** Commit the transaction currently in progress. 004042 ** 004043 ** This routine implements the second phase of a 2-phase commit. The 004044 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should 004045 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne() 004046 ** routine did all the work of writing information out to disk and flushing the 004047 ** contents so that they are written onto the disk platter. All this 004048 ** routine has to do is delete or truncate or zero the header in the 004049 ** the rollback journal (which causes the transaction to commit) and 004050 ** drop locks. 004051 ** 004052 ** Normally, if an error occurs while the pager layer is attempting to 004053 ** finalize the underlying journal file, this function returns an error and 004054 ** the upper layer will attempt a rollback. However, if the second argument 004055 ** is non-zero then this b-tree transaction is part of a multi-file 004056 ** transaction. In this case, the transaction has already been committed 004057 ** (by deleting a master journal file) and the caller will ignore this 004058 ** functions return code. So, even if an error occurs in the pager layer, 004059 ** reset the b-tree objects internal state to indicate that the write 004060 ** transaction has been closed. This is quite safe, as the pager will have 004061 ** transitioned to the error state. 004062 ** 004063 ** This will release the write lock on the database file. 
If there 004064 ** are no active cursors, it also releases the read lock. 004065 */ 004066 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ 004067 004068 if( p->inTrans==TRANS_NONE ) return SQLITE_OK; 004069 sqlite3BtreeEnter(p); 004070 btreeIntegrity(p); 004071 004072 /* If the handle has a write-transaction open, commit the shared-btrees 004073 ** transaction and set the shared state to TRANS_READ. 004074 */ 004075 if( p->inTrans==TRANS_WRITE ){ 004076 int rc; 004077 BtShared *pBt = p->pBt; 004078 assert( pBt->inTransaction==TRANS_WRITE ); 004079 assert( pBt->nTransaction>0 ); 004080 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 004081 if( rc!=SQLITE_OK && bCleanup==0 ){ 004082 sqlite3BtreeLeave(p); 004083 return rc; 004084 } 004085 p->iDataVersion--; /* Compensate for pPager->iDataVersion++; */ 004086 pBt->inTransaction = TRANS_READ; 004087 btreeClearHasContent(pBt); 004088 } 004089 004090 btreeEndTransaction(p); 004091 sqlite3BtreeLeave(p); 004092 return SQLITE_OK; 004093 } 004094 004095 /* 004096 ** Do both phases of a commit. 004097 */ 004098 int sqlite3BtreeCommit(Btree *p){ 004099 int rc; 004100 sqlite3BtreeEnter(p); 004101 rc = sqlite3BtreeCommitPhaseOne(p, 0); 004102 if( rc==SQLITE_OK ){ 004103 rc = sqlite3BtreeCommitPhaseTwo(p, 0); 004104 } 004105 sqlite3BtreeLeave(p); 004106 return rc; 004107 } 004108 004109 /* 004110 ** This routine sets the state to CURSOR_FAULT and the error 004111 ** code to errCode for every cursor on any BtShared that pBtree 004112 ** references. Or if the writeOnly flag is set to 1, then only 004113 ** trip write cursors and leave read cursors unchanged. 004114 ** 004115 ** Every cursor is a candidate to be tripped, including cursors 004116 ** that belong to other database connections that happen to be 004117 ** sharing the cache with pBtree. 004118 ** 004119 ** This routine gets called when a rollback occurs. 
If the writeOnly 004120 ** flag is true, then only write-cursors need be tripped - read-only 004121 ** cursors save their current positions so that they may continue 004122 ** following the rollback. Or, if writeOnly is false, all cursors are 004123 ** tripped. In general, writeOnly is false if the transaction being 004124 ** rolled back modified the database schema. In this case b-tree root 004125 ** pages may be moved or deleted from the database altogether, making 004126 ** it unsafe for read cursors to continue. 004127 ** 004128 ** If the writeOnly flag is true and an error is encountered while 004129 ** saving the current position of a read-only cursor, all cursors, 004130 ** including all read-cursors are tripped. 004131 ** 004132 ** SQLITE_OK is returned if successful, or if an error occurs while 004133 ** saving a cursor position, an SQLite error code. 004134 */ 004135 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){ 004136 BtCursor *p; 004137 int rc = SQLITE_OK; 004138 004139 assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 ); 004140 if( pBtree ){ 004141 sqlite3BtreeEnter(pBtree); 004142 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 004143 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){ 004144 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){ 004145 rc = saveCursorPosition(p); 004146 if( rc!=SQLITE_OK ){ 004147 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0); 004148 break; 004149 } 004150 } 004151 }else{ 004152 sqlite3BtreeClearCursor(p); 004153 p->eState = CURSOR_FAULT; 004154 p->skipNext = errCode; 004155 } 004156 btreeReleaseAllCursorPages(p); 004157 } 004158 sqlite3BtreeLeave(pBtree); 004159 } 004160 return rc; 004161 } 004162 004163 /* 004164 ** Set the pBt->nPage field correctly, according to the current 004165 ** state of the database. Assume pBt->pPage1 is valid. 
004166 */ 004167 static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){ 004168 int nPage = get4byte(&pPage1->aData[28]); 004169 testcase( nPage==0 ); 004170 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage); 004171 testcase( pBt->nPage!=nPage ); 004172 pBt->nPage = nPage; 004173 } 004174 004175 /* 004176 ** Rollback the transaction in progress. 004177 ** 004178 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped). 004179 ** Only write cursors are tripped if writeOnly is true but all cursors are 004180 ** tripped if writeOnly is false. Any attempt to use 004181 ** a tripped cursor will result in an error. 004182 ** 004183 ** This will release the write lock on the database file. If there 004184 ** are no active cursors, it also releases the read lock. 004185 */ 004186 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){ 004187 int rc; 004188 BtShared *pBt = p->pBt; 004189 MemPage *pPage1; 004190 004191 assert( writeOnly==1 || writeOnly==0 ); 004192 assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK ); 004193 sqlite3BtreeEnter(p); 004194 if( tripCode==SQLITE_OK ){ 004195 rc = tripCode = saveAllCursors(pBt, 0, 0); 004196 if( rc ) writeOnly = 0; 004197 }else{ 004198 rc = SQLITE_OK; 004199 } 004200 if( tripCode ){ 004201 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly); 004202 assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) ); 004203 if( rc2!=SQLITE_OK ) rc = rc2; 004204 } 004205 btreeIntegrity(p); 004206 004207 if( p->inTrans==TRANS_WRITE ){ 004208 int rc2; 004209 004210 assert( TRANS_WRITE==pBt->inTransaction ); 004211 rc2 = sqlite3PagerRollback(pBt->pPager); 004212 if( rc2!=SQLITE_OK ){ 004213 rc = rc2; 004214 } 004215 004216 /* The rollback may have destroyed the pPage1->aData value. So 004217 ** call btreeGetPage() on page 1 again to make 004218 ** sure pPage1->aData is set correctly. 
*/ 004219 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 004220 btreeSetNPage(pBt, pPage1); 004221 releasePageOne(pPage1); 004222 } 004223 assert( countValidCursors(pBt, 1)==0 ); 004224 pBt->inTransaction = TRANS_READ; 004225 btreeClearHasContent(pBt); 004226 } 004227 004228 btreeEndTransaction(p); 004229 sqlite3BtreeLeave(p); 004230 return rc; 004231 } 004232 004233 /* 004234 ** Start a statement subtransaction. The subtransaction can be rolled 004235 ** back independently of the main transaction. You must start a transaction 004236 ** before starting a subtransaction. The subtransaction is ended automatically 004237 ** if the main transaction commits or rolls back. 004238 ** 004239 ** Statement subtransactions are used around individual SQL statements 004240 ** that are contained within a BEGIN...COMMIT block. If a constraint 004241 ** error occurs within the statement, the effect of that one statement 004242 ** can be rolled back without having to rollback the entire transaction. 004243 ** 004244 ** A statement sub-transaction is implemented as an anonymous savepoint. The 004245 ** value passed as the second parameter is the total number of savepoints, 004246 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there 004247 ** are no active savepoints and no other statement-transactions open, 004248 ** iStatement is 1. This anonymous savepoint can be released or rolled back 004249 ** using the sqlite3BtreeSavepoint() function. 
004250 */ 004251 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ 004252 int rc; 004253 BtShared *pBt = p->pBt; 004254 sqlite3BtreeEnter(p); 004255 assert( p->inTrans==TRANS_WRITE ); 004256 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 004257 assert( iStatement>0 ); 004258 assert( iStatement>p->db->nSavepoint ); 004259 assert( pBt->inTransaction==TRANS_WRITE ); 004260 /* At the pager level, a statement transaction is a savepoint with 004261 ** an index greater than all savepoints created explicitly using 004262 ** SQL statements. It is illegal to open, release or rollback any 004263 ** such savepoints while the statement transaction savepoint is active. 004264 */ 004265 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); 004266 sqlite3BtreeLeave(p); 004267 return rc; 004268 } 004269 004270 /* 004271 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK 004272 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the 004273 ** savepoint identified by parameter iSavepoint, depending on the value 004274 ** of op. 004275 ** 004276 ** Normally, iSavepoint is greater than or equal to zero. However, if op is 004277 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 004278 ** contents of the entire transaction are rolled back. This is different 004279 ** from a normal transaction rollback, as no locks are released and the 004280 ** transaction remains open. 
004281 */ 004282 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ 004283 int rc = SQLITE_OK; 004284 if( p && p->inTrans==TRANS_WRITE ){ 004285 BtShared *pBt = p->pBt; 004286 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); 004287 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); 004288 sqlite3BtreeEnter(p); 004289 if( op==SAVEPOINT_ROLLBACK ){ 004290 rc = saveAllCursors(pBt, 0, 0); 004291 } 004292 if( rc==SQLITE_OK ){ 004293 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); 004294 } 004295 if( rc==SQLITE_OK ){ 004296 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){ 004297 pBt->nPage = 0; 004298 } 004299 rc = newDatabase(pBt); 004300 btreeSetNPage(pBt, pBt->pPage1); 004301 004302 /* pBt->nPage might be zero if the database was corrupt when 004303 ** the transaction was started. Otherwise, it must be at least 1. */ 004304 assert( CORRUPT_DB || pBt->nPage>0 ); 004305 } 004306 sqlite3BtreeLeave(p); 004307 } 004308 return rc; 004309 } 004310 004311 /* 004312 ** Create a new cursor for the BTree whose root is on the page 004313 ** iTable. If a read-only cursor is requested, it is assumed that 004314 ** the caller already has at least a read-only transaction open 004315 ** on the database already. If a write-cursor is requested, then 004316 ** the caller is assumed to have an open write transaction. 004317 ** 004318 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only 004319 ** be used for reading. If the BTREE_WRCSR bit is set, then the cursor 004320 ** can be used for reading or for writing if other conditions for writing 004321 ** are also met. 
These are the conditions that must be met in order 004322 ** for writing to be allowed: 004323 ** 004324 ** 1: The cursor must have been opened with wrFlag containing BTREE_WRCSR 004325 ** 004326 ** 2: Other database connections that share the same pager cache 004327 ** but which are not in the READ_UNCOMMITTED state may not have 004328 ** cursors open with wrFlag==0 on the same table. Otherwise 004329 ** the changes made by this write cursor would be visible to 004330 ** the read cursors in the other database connection. 004331 ** 004332 ** 3: The database must be writable (not on read-only media) 004333 ** 004334 ** 4: There must be an active transaction. 004335 ** 004336 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR 004337 ** is set. If FORDELETE is set, that is a hint to the implementation that 004338 ** this cursor will only be used to seek to and delete entries of an index 004339 ** as part of a larger DELETE statement. The FORDELETE hint is not used by 004340 ** this implementation. But in a hypothetical alternative storage engine 004341 ** in which index entries are automatically deleted when corresponding table 004342 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE 004343 ** operations on this cursor can be no-ops and all READ operations can 004344 ** return a null row (2-bytes: 0x01 0x00). 004345 ** 004346 ** No checking is done to make sure that page iTable really is the 004347 ** root page of a b-tree. If it is not, then the cursor acquired 004348 ** will not work correctly. 004349 ** 004350 ** It is assumed that the sqlite3BtreeCursorZero() has been called 004351 ** on pCur to initialize the memory space prior to invoking this routine. 004352 */ 004353 static int btreeCursor( 004354 Btree *p, /* The btree */ 004355 int iTable, /* Root page of table to open */ 004356 int wrFlag, /* 1 to write. 
0 read-only */ 004357 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 004358 BtCursor *pCur /* Space for new cursor */ 004359 ){ 004360 BtShared *pBt = p->pBt; /* Shared b-tree handle */ 004361 BtCursor *pX; /* Looping over other all cursors */ 004362 004363 assert( sqlite3BtreeHoldsMutex(p) ); 004364 assert( wrFlag==0 004365 || wrFlag==BTREE_WRCSR 004366 || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) 004367 ); 004368 004369 /* The following assert statements verify that if this is a sharable 004370 ** b-tree database, the connection is holding the required table locks, 004371 ** and that no other connection has any open cursor that conflicts with 004372 ** this lock. The iTable<1 term disables the check for corrupt schemas. */ 004373 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) 004374 || iTable<1 ); 004375 assert( wrFlag==0 || !hasReadConflicts(p, iTable) ); 004376 004377 /* Assert that the caller has opened the required transaction. */ 004378 assert( p->inTrans>TRANS_NONE ); 004379 assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); 004380 assert( pBt->pPage1 && pBt->pPage1->aData ); 004381 assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 ); 004382 004383 if( wrFlag ){ 004384 allocateTempSpace(pBt); 004385 if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT; 004386 } 004387 if( iTable<=1 ){ 004388 if( iTable<1 ){ 004389 return SQLITE_CORRUPT_BKPT; 004390 }else if( btreePagecount(pBt)==0 ){ 004391 assert( wrFlag==0 ); 004392 iTable = 0; 004393 } 004394 } 004395 004396 /* Now that no other errors can occur, finish filling in the BtCursor 004397 ** variables and link the cursor into the BtShared list. */ 004398 pCur->pgnoRoot = (Pgno)iTable; 004399 pCur->iPage = -1; 004400 pCur->pKeyInfo = pKeyInfo; 004401 pCur->pBtree = p; 004402 pCur->pBt = pBt; 004403 pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0; 004404 pCur->curPagerFlags = wrFlag ? 
0 : PAGER_GET_READONLY; 004405 /* If there are two or more cursors on the same btree, then all such 004406 ** cursors *must* have the BTCF_Multiple flag set. */ 004407 for(pX=pBt->pCursor; pX; pX=pX->pNext){ 004408 if( pX->pgnoRoot==(Pgno)iTable ){ 004409 pX->curFlags |= BTCF_Multiple; 004410 pCur->curFlags |= BTCF_Multiple; 004411 } 004412 } 004413 pCur->pNext = pBt->pCursor; 004414 pBt->pCursor = pCur; 004415 pCur->eState = CURSOR_INVALID; 004416 return SQLITE_OK; 004417 } 004418 static int btreeCursorWithLock( 004419 Btree *p, /* The btree */ 004420 int iTable, /* Root page of table to open */ 004421 int wrFlag, /* 1 to write. 0 read-only */ 004422 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 004423 BtCursor *pCur /* Space for new cursor */ 004424 ){ 004425 int rc; 004426 sqlite3BtreeEnter(p); 004427 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 004428 sqlite3BtreeLeave(p); 004429 return rc; 004430 } 004431 int sqlite3BtreeCursor( 004432 Btree *p, /* The btree */ 004433 int iTable, /* Root page of table to open */ 004434 int wrFlag, /* 1 to write. 0 read-only */ 004435 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ 004436 BtCursor *pCur /* Write new cursor here */ 004437 ){ 004438 if( p->sharable ){ 004439 return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur); 004440 }else{ 004441 return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 004442 } 004443 } 004444 004445 /* 004446 ** Return the size of a BtCursor object in bytes. 004447 ** 004448 ** This interfaces is needed so that users of cursors can preallocate 004449 ** sufficient storage to hold a cursor. The BtCursor object is opaque 004450 ** to users so they cannot do the sizeof() themselves - they must call 004451 ** this routine. 004452 */ 004453 int sqlite3BtreeCursorSize(void){ 004454 return ROUND8(sizeof(BtCursor)); 004455 } 004456 004457 /* 004458 ** Initialize memory that will be converted into a BtCursor object. 
004459 ** 004460 ** The simple approach here would be to memset() the entire object 004461 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays 004462 ** do not need to be zeroed and they are large, so we can save a lot 004463 ** of run-time by skipping the initialization of those elements. 004464 */ 004465 void sqlite3BtreeCursorZero(BtCursor *p){ 004466 memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT)); 004467 } 004468 004469 /* 004470 ** Close a cursor. The read lock on the database file is released 004471 ** when the last cursor is closed. 004472 */ 004473 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 004474 Btree *pBtree = pCur->pBtree; 004475 if( pBtree ){ 004476 BtShared *pBt = pCur->pBt; 004477 sqlite3BtreeEnter(pBtree); 004478 assert( pBt->pCursor!=0 ); 004479 if( pBt->pCursor==pCur ){ 004480 pBt->pCursor = pCur->pNext; 004481 }else{ 004482 BtCursor *pPrev = pBt->pCursor; 004483 do{ 004484 if( pPrev->pNext==pCur ){ 004485 pPrev->pNext = pCur->pNext; 004486 break; 004487 } 004488 pPrev = pPrev->pNext; 004489 }while( ALWAYS(pPrev) ); 004490 } 004491 btreeReleaseAllCursorPages(pCur); 004492 unlockBtreeIfUnused(pBt); 004493 sqlite3_free(pCur->aOverflow); 004494 sqlite3_free(pCur->pKey); 004495 sqlite3BtreeLeave(pBtree); 004496 pCur->pBtree = 0; 004497 } 004498 return SQLITE_OK; 004499 } 004500 004501 /* 004502 ** Make sure the BtCursor* given in the argument has a valid 004503 ** BtCursor.info structure. If it is not already valid, call 004504 ** btreeParseCell() to fill it in. 004505 ** 004506 ** BtCursor.info is a cache of the information in the current cell. 004507 ** Using this cache reduces the number of calls to btreeParseCell(). 
004508 */ 004509 #ifndef NDEBUG 004510 static int cellInfoEqual(CellInfo *a, CellInfo *b){ 004511 if( a->nKey!=b->nKey ) return 0; 004512 if( a->pPayload!=b->pPayload ) return 0; 004513 if( a->nPayload!=b->nPayload ) return 0; 004514 if( a->nLocal!=b->nLocal ) return 0; 004515 if( a->nSize!=b->nSize ) return 0; 004516 return 1; 004517 } 004518 static void assertCellInfo(BtCursor *pCur){ 004519 CellInfo info; 004520 memset(&info, 0, sizeof(info)); 004521 btreeParseCell(pCur->pPage, pCur->ix, &info); 004522 assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) ); 004523 } 004524 #else 004525 #define assertCellInfo(x) 004526 #endif 004527 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){ 004528 if( pCur->info.nSize==0 ){ 004529 pCur->curFlags |= BTCF_ValidNKey; 004530 btreeParseCell(pCur->pPage,pCur->ix,&pCur->info); 004531 }else{ 004532 assertCellInfo(pCur); 004533 } 004534 } 004535 004536 #ifndef NDEBUG /* The next routine used only within assert() statements */ 004537 /* 004538 ** Return true if the given BtCursor is valid. A valid cursor is one 004539 ** that is currently pointing to a row in a (non-empty) table. 004540 ** This is a verification routine is used only within assert() statements. 004541 */ 004542 int sqlite3BtreeCursorIsValid(BtCursor *pCur){ 004543 return pCur && pCur->eState==CURSOR_VALID; 004544 } 004545 #endif /* NDEBUG */ 004546 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){ 004547 assert( pCur!=0 ); 004548 return pCur->eState==CURSOR_VALID; 004549 } 004550 004551 /* 004552 ** Return the value of the integer key or "rowid" for a table btree. 004553 ** This routine is only valid for a cursor that is pointing into a 004554 ** ordinary table btree. If the cursor points to an index btree or 004555 ** is invalid, the result of this routine is undefined. 
004556 */ 004557 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){ 004558 assert( cursorHoldsMutex(pCur) ); 004559 assert( pCur->eState==CURSOR_VALID ); 004560 assert( pCur->curIntKey ); 004561 getCellInfo(pCur); 004562 return pCur->info.nKey; 004563 } 004564 004565 #ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC 004566 /* 004567 ** Return the offset into the database file for the start of the 004568 ** payload to which the cursor is pointing. 004569 */ 004570 i64 sqlite3BtreeOffset(BtCursor *pCur){ 004571 assert( cursorHoldsMutex(pCur) ); 004572 assert( pCur->eState==CURSOR_VALID ); 004573 getCellInfo(pCur); 004574 return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) + 004575 (i64)(pCur->info.pPayload - pCur->pPage->aData); 004576 } 004577 #endif /* SQLITE_ENABLE_OFFSET_SQL_FUNC */ 004578 004579 /* 004580 ** Return the number of bytes of payload for the entry that pCur is 004581 ** currently pointing to. For table btrees, this will be the amount 004582 ** of data. For index btrees, this will be the size of the key. 004583 ** 004584 ** The caller must guarantee that the cursor is pointing to a non-NULL 004585 ** valid entry. In other words, the calling procedure must guarantee 004586 ** that the cursor has Cursor.eState==CURSOR_VALID. 004587 */ 004588 u32 sqlite3BtreePayloadSize(BtCursor *pCur){ 004589 assert( cursorHoldsMutex(pCur) ); 004590 assert( pCur->eState==CURSOR_VALID ); 004591 getCellInfo(pCur); 004592 return pCur->info.nPayload; 004593 } 004594 004595 /* 004596 ** Return an upper bound on the size of any record for the table 004597 ** that the cursor is pointing into. 004598 ** 004599 ** This is an optimization. Everything will still work if this 004600 ** routine always returns 2147483647 (which is the largest record 004601 ** that SQLite can handle) or more. But returning a smaller value might 004602 ** prevent large memory allocations when trying to interpret a 004603 ** corrupt datrabase. 
004604 ** 004605 ** The current implementation merely returns the size of the underlying 004606 ** database file. 004607 */ 004608 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){ 004609 assert( cursorHoldsMutex(pCur) ); 004610 assert( pCur->eState==CURSOR_VALID ); 004611 return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage; 004612 } 004613 004614 /* 004615 ** Given the page number of an overflow page in the database (parameter 004616 ** ovfl), this function finds the page number of the next page in the 004617 ** linked list of overflow pages. If possible, it uses the auto-vacuum 004618 ** pointer-map data instead of reading the content of page ovfl to do so. 004619 ** 004620 ** If an error occurs an SQLite error code is returned. Otherwise: 004621 ** 004622 ** The page number of the next overflow page in the linked list is 004623 ** written to *pPgnoNext. If page ovfl is the last page in its linked 004624 ** list, *pPgnoNext is set to zero. 004625 ** 004626 ** If ppPage is not NULL, and a reference to the MemPage object corresponding 004627 ** to page number pOvfl was obtained, then *ppPage is set to point to that 004628 ** reference. It is the responsibility of the caller to call releasePage() 004629 ** on *ppPage to free the reference. In no reference was obtained (because 004630 ** the pointer-map was used to obtain the value for *pPgnoNext), then 004631 ** *ppPage is set to zero. 
004632 */ 004633 static int getOverflowPage( 004634 BtShared *pBt, /* The database file */ 004635 Pgno ovfl, /* Current overflow page number */ 004636 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ 004637 Pgno *pPgnoNext /* OUT: Next overflow page number */ 004638 ){ 004639 Pgno next = 0; 004640 MemPage *pPage = 0; 004641 int rc = SQLITE_OK; 004642 004643 assert( sqlite3_mutex_held(pBt->mutex) ); 004644 assert(pPgnoNext); 004645 004646 #ifndef SQLITE_OMIT_AUTOVACUUM 004647 /* Try to find the next page in the overflow list using the 004648 ** autovacuum pointer-map pages. Guess that the next page in 004649 ** the overflow list is page number (ovfl+1). If that guess turns 004650 ** out to be wrong, fall back to loading the data of page 004651 ** number ovfl to determine the next page number. 004652 */ 004653 if( pBt->autoVacuum ){ 004654 Pgno pgno; 004655 Pgno iGuess = ovfl+1; 004656 u8 eType; 004657 004658 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 004659 iGuess++; 004660 } 004661 004662 if( iGuess<=btreePagecount(pBt) ){ 004663 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 004664 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 004665 next = iGuess; 004666 rc = SQLITE_DONE; 004667 } 004668 } 004669 } 004670 #endif 004671 004672 assert( next==0 || rc==SQLITE_DONE ); 004673 if( rc==SQLITE_OK ){ 004674 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0); 004675 assert( rc==SQLITE_OK || pPage==0 ); 004676 if( rc==SQLITE_OK ){ 004677 next = get4byte(pPage->aData); 004678 } 004679 } 004680 004681 *pPgnoNext = next; 004682 if( ppPage ){ 004683 *ppPage = pPage; 004684 }else{ 004685 releasePage(pPage); 004686 } 004687 return (rc==SQLITE_DONE ? SQLITE_OK : rc); 004688 } 004689 004690 /* 004691 ** Copy data from a buffer to a page, or from a page to a buffer. 004692 ** 004693 ** pPayload is a pointer to data stored on database page pDbPage. 
004694 ** If argument eOp is false, then nByte bytes of data are copied 004695 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 004696 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 004697 ** of data are copied from the buffer pBuf to pPayload. 004698 ** 004699 ** SQLITE_OK is returned on success, otherwise an error code. 004700 */ 004701 static int copyPayload( 004702 void *pPayload, /* Pointer to page data */ 004703 void *pBuf, /* Pointer to buffer */ 004704 int nByte, /* Number of bytes to copy */ 004705 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 004706 DbPage *pDbPage /* Page containing pPayload */ 004707 ){ 004708 if( eOp ){ 004709 /* Copy data from buffer to page (a write operation) */ 004710 int rc = sqlite3PagerWrite(pDbPage); 004711 if( rc!=SQLITE_OK ){ 004712 return rc; 004713 } 004714 memcpy(pPayload, pBuf, nByte); 004715 }else{ 004716 /* Copy data from page to buffer (a read operation) */ 004717 memcpy(pBuf, pPayload, nByte); 004718 } 004719 return SQLITE_OK; 004720 } 004721 004722 /* 004723 ** This function is used to read or overwrite payload information 004724 ** for the entry that the pCur cursor is pointing to. The eOp 004725 ** argument is interpreted as follows: 004726 ** 004727 ** 0: The operation is a read. Populate the overflow cache. 004728 ** 1: The operation is a write. Populate the overflow cache. 004729 ** 004730 ** A total of "amt" bytes are read or written beginning at "offset". 004731 ** Data is read to or from the buffer pBuf. 004732 ** 004733 ** The content being read or written might appear on the main page 004734 ** or be scattered out on multiple overflow pages. 004735 ** 004736 ** If the current cursor entry uses one or more overflow pages 004737 ** this function may allocate space for and lazily populate 004738 ** the overflow page-list cache array (BtCursor.aOverflow). 004739 ** Subsequent calls use this cache to make seeking to the supplied offset 004740 ** more efficient. 
**
** Once an overflow page-list cache has been allocated, it must be
** invalidated if some other cursor writes to the same table, or if
** the cursor is moved to a different row. Additionally, in auto-vacuum
** mode, the following events may invalidate an overflow page-list cache.
**
**   * An incremental vacuum,
**   * A commit in auto_vacuum="full" mode,
**   * Creating a table (may require moving an overflow page).
**
** Return values: SQLITE_OK on success; SQLITE_NOMEM if the overflow
** page-list cache cannot be (re)allocated; a corruption code if the local
** payload would extend past the usable area of the page or if the overflow
** chain ends before "amt" bytes have been transferred; otherwise whatever
** error the pager or OS layer reported.
*/
static int accessPayload(
  BtCursor *pCur,      /* Cursor pointing to entry to read from or write to */
  u32 offset,          /* Begin reading/writing this far into payload */
  u32 amt,             /* Number of bytes to read or write */
  unsigned char *pBuf, /* Destination buffer (read) or source buffer (write) */
  int eOp              /* zero to read. non-zero to write. */
){
  unsigned char *aPayload;                    /* Start of payload being accessed */
  int rc = SQLITE_OK;                         /* Return code */
  int iIdx = 0;                               /* Index into pCur->aOverflow[] */
  MemPage *pPage = pCur->pPage;               /* Btree page of current entry */
  BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
#ifdef SQLITE_DIRECT_OVERFLOW_READ
  unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
#endif

  assert( pPage );
  assert( eOp==0 || eOp==1 );
  assert( pCur->eState==CURSOR_VALID );
  assert( pCur->ix<pPage->nCell );
  assert( cursorHoldsMutex(pCur) );

  /* Make sure pCur->info describes the current cell. */
  getCellInfo(pCur);
  aPayload = pCur->info.pPayload;
  /* The caller must not request bytes beyond the end of the payload. */
  assert( offset+amt <= pCur->info.nPayload );

  assert( aPayload > pPage->aData );
  if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
    /* Trying to read or write past the end of the data is an error.  The
    ** conditional above is really:
    **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
    ** but is recast into its current form to avoid integer overflow problems
    */
    return SQLITE_CORRUPT_PAGE(pPage);
  }

  /* Check if data must be read/written to/from the btree page itself. */
  if( offset<pCur->info.nLocal ){
    /* Transfer the part of the request that lies in the local payload. */
    int a = amt;
    if( a+offset>pCur->info.nLocal ){
      a = pCur->info.nLocal - offset;
    }
    rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
    /* Anything still to transfer begins at the first overflow byte. */
    offset = 0;
    pBuf += a;
    amt -= a;
  }else{
    /* Request lies wholly in overflow: make offset relative to the start
    ** of the overflow content. */
    offset -= pCur->info.nLocal;
  }


  if( rc==SQLITE_OK && amt>0 ){
    const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
    Pgno nextPage;                             /* Next overflow page to visit */

    /* The first overflow page number is stored in the 4 bytes that follow
    ** the local payload. */
    nextPage = get4byte(&aPayload[pCur->info.nLocal]);

    /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
    **
    ** The aOverflow[] array is sized at one entry for each overflow page
    ** in the overflow chain. The page number of the first overflow page is
    ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
    ** means "not yet known" (the cache is lazily populated).
    */
    if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
      int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
      if( pCur->aOverflow==0
       || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow)
      ){
        /* Space for twice the required number of entries is requested;
        ** NOTE(review): presumably to reduce reallocations on later,
        ** larger rows - confirm. */
        Pgno *aNew = (Pgno*)sqlite3Realloc(
            pCur->aOverflow, nOvfl*2*sizeof(Pgno)
        );
        if( aNew==0 ){
          return SQLITE_NOMEM_BKPT;
        }else{
          pCur->aOverflow = aNew;
        }
      }
      /* Only the first nOvfl entries are marked "not yet known". */
      memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
      pCur->curFlags |= BTCF_ValidOvfl;
    }else{
      /* If the overflow page-list cache has been allocated and the
      ** entry for the first required overflow page is valid, skip
      ** directly to it.
      */
      if( pCur->aOverflow[offset/ovflSize] ){
        iIdx = (offset/ovflSize);
        nextPage = pCur->aOverflow[iIdx];
        offset = (offset%ovflSize);
      }
    }

    assert( rc==SQLITE_OK && amt>0 );
    while( nextPage ){
      /* If required, populate the overflow page-list cache. */
      assert( pCur->aOverflow[iIdx]==0
              || pCur->aOverflow[iIdx]==nextPage
              || CORRUPT_DB );
      pCur->aOverflow[iIdx] = nextPage;

      if( offset>=ovflSize ){
        /* The only reason to read this page is to obtain the page
        ** number for the next page in the overflow chain. The page
        ** data is not required. So first try to lookup the overflow
        ** page-list cache, if any, then fall back to the getOverflowPage()
        ** function.
        */
        assert( pCur->curFlags & BTCF_ValidOvfl );
        assert( pCur->pBtree->db==pBt->db );
        if( pCur->aOverflow[iIdx+1] ){
          nextPage = pCur->aOverflow[iIdx+1];
        }else{
          rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
        }
        offset -= ovflSize;
      }else{
        /* Need to read this page properly. It contains some of the
        ** range of data that is being read (eOp==0) or written (eOp!=0).
        */
        int a = amt;
        if( a + offset > ovflSize ){
          a = ovflSize - offset;
        }

#ifdef SQLITE_DIRECT_OVERFLOW_READ
        /* If all the following are true:
        **
        **   1) this is a read operation, and
        **   2) data is required from the start of this overflow page, and
        **   3) there are no dirty pages in the page-cache
        **   4) the database is file-backed, and
        **   5) the page is not in the WAL file
        **   6) at least 4 bytes have already been read into the output buffer
        **
        ** then data can be read directly from the database file into the
        ** output buffer, bypassing the page-cache altogether. This speeds
        ** up loading large records that span many overflow pages.
        */
        if( eOp==0                                             /* (1) */
         && offset==0                                          /* (2) */
         && sqlite3PagerDirectReadOk(pBt->pPager, nextPage)    /* (3,4,5) */
         && &pBuf[-4]>=pBufStart                               /* (6) */
        ){
          sqlite3_file *fd = sqlite3PagerFile(pBt->pPager);
          u8 aSave[4];
          u8 *aWrite = &pBuf[-4];
          assert( aWrite>=pBufStart );                         /* due to (6) */
          /* The read starts 4 bytes before pBuf so that the page's leading
          ** 4-byte next-page pointer lands on already-written output.  Those
          ** 4 output bytes are saved here and restored once the pointer has
          ** been decoded. */
          memcpy(aSave, aWrite, 4);
          rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
          /* A failed read of a page number past the end of the database is
          ** reported as corruption rather than as the raw I/O error. */
          if( rc && nextPage>pBt->nPage ) rc = SQLITE_CORRUPT_BKPT;
          nextPage = get4byte(aWrite);
          memcpy(aWrite, aSave, 4);
        }else
#endif

        {
          /* Ordinary path: fetch the overflow page through the pager. */
          DbPage *pDbPage;
          rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
              (eOp==0 ? PAGER_GET_READONLY : 0)
          );
          if( rc==SQLITE_OK ){
            aPayload = sqlite3PagerGetData(pDbPage);
            /* First 4 bytes of the page: next page in the chain. */
            nextPage = get4byte(aPayload);
            rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
            sqlite3PagerUnref(pDbPage);
            offset = 0;
          }
        }
        amt -= a;
        /* Everything requested has been transferred: stop immediately. */
        if( amt==0 ) return rc;
        pBuf += a;
      }
      if( rc ) break;
      iIdx++;
    }
  }

  if( rc==SQLITE_OK && amt>0 ){
    /* Overflow chain ends prematurely */
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  return rc;
}

/*
** Read part of the payload for the row at which that cursor pCur is currently
** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
** begins at "offset".
**
** pCur can be pointing to either a table or an index b-tree.
** If pointing to a table btree, then the content section is read.  If
** pCur is pointing to an index b-tree then the key section is read.
**
** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
** cursor might be invalid or might need to be restored before being read.
**
** Return SQLITE_OK on success or an error code if anything goes
** wrong.  An error is returned if "offset+amt" is larger than
** the available payload.
*/
int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  assert( cursorHoldsMutex(pCur) );
  assert( pCur->eState==CURSOR_VALID );
  assert( pCur->iPage>=0 && pCur->pPage );
  assert( pCur->ix<pCur->pPage->nCell );
  /* Final argument 0 selects a read. */
  return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
}

/*
** This variant of sqlite3BtreePayload() works even if the cursor is not
** in the CURSOR_VALID state.
It is only used by the sqlite3_blob_read() 004963 ** interface. 004964 */ 004965 #ifndef SQLITE_OMIT_INCRBLOB 004966 static SQLITE_NOINLINE int accessPayloadChecked( 004967 BtCursor *pCur, 004968 u32 offset, 004969 u32 amt, 004970 void *pBuf 004971 ){ 004972 int rc; 004973 if ( pCur->eState==CURSOR_INVALID ){ 004974 return SQLITE_ABORT; 004975 } 004976 assert( cursorOwnsBtShared(pCur) ); 004977 rc = btreeRestoreCursorPosition(pCur); 004978 return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0); 004979 } 004980 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 004981 if( pCur->eState==CURSOR_VALID ){ 004982 assert( cursorOwnsBtShared(pCur) ); 004983 return accessPayload(pCur, offset, amt, pBuf, 0); 004984 }else{ 004985 return accessPayloadChecked(pCur, offset, amt, pBuf); 004986 } 004987 } 004988 #endif /* SQLITE_OMIT_INCRBLOB */ 004989 004990 /* 004991 ** Return a pointer to payload information from the entry that the 004992 ** pCur cursor is pointing to. The pointer is to the beginning of 004993 ** the key if index btrees (pPage->intKey==0) and is the data for 004994 ** table btrees (pPage->intKey==1). The number of bytes of available 004995 ** key/data is written into *pAmt. If *pAmt==0, then the value 004996 ** returned will not be a valid pointer. 004997 ** 004998 ** This routine is an optimization. It is common for the entire key 004999 ** and data to fit on the local page and for there to be no overflow 005000 ** pages. When that is so, this routine can be used to access the 005001 ** key and data without making a copy. If the key and/or data spills 005002 ** onto overflow pages, then accessPayload() must be used to reassemble 005003 ** the key/data and copy it into a preallocated buffer. 005004 ** 005005 ** The pointer returned by this routine looks directly into the cached 005006 ** page of the database. The data might change or move the next time 005007 ** any btree routine is called. 
005008 */ 005009 static const void *fetchPayload( 005010 BtCursor *pCur, /* Cursor pointing to entry to read from */ 005011 u32 *pAmt /* Write the number of available bytes here */ 005012 ){ 005013 int amt; 005014 assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage); 005015 assert( pCur->eState==CURSOR_VALID ); 005016 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005017 assert( cursorOwnsBtShared(pCur) ); 005018 assert( pCur->ix<pCur->pPage->nCell ); 005019 assert( pCur->info.nSize>0 ); 005020 assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB ); 005021 assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB); 005022 amt = pCur->info.nLocal; 005023 if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){ 005024 /* There is too little space on the page for the expected amount 005025 ** of local content. Database must be corrupt. */ 005026 assert( CORRUPT_DB ); 005027 amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload)); 005028 } 005029 *pAmt = (u32)amt; 005030 return (void*)pCur->info.pPayload; 005031 } 005032 005033 005034 /* 005035 ** For the entry that cursor pCur is point to, return as 005036 ** many bytes of the key or data as are available on the local 005037 ** b-tree page. Write the number of available bytes into *pAmt. 005038 ** 005039 ** The pointer returned is ephemeral. The key/data may move 005040 ** or be destroyed on the next call to any Btree routine, 005041 ** including calls from other threads against the same cache. 005042 ** Hence, a mutex on the BtShared should be held prior to calling 005043 ** this routine. 005044 ** 005045 ** These routines is used to get quick access to key and data 005046 ** in the common case where no overflow pages are used. 005047 */ 005048 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){ 005049 return fetchPayload(pCur, pAmt); 005050 } 005051 005052 005053 /* 005054 ** Move the cursor down to a new child page. 
The newPgno argument is the 005055 ** page number of the child page to move to. 005056 ** 005057 ** This function returns SQLITE_CORRUPT if the page-header flags field of 005058 ** the new child page does not match the flags field of the parent (i.e. 005059 ** if an intkey page appears to be the parent of a non-intkey page, or 005060 ** vice-versa). 005061 */ 005062 static int moveToChild(BtCursor *pCur, u32 newPgno){ 005063 BtShared *pBt = pCur->pBt; 005064 005065 assert( cursorOwnsBtShared(pCur) ); 005066 assert( pCur->eState==CURSOR_VALID ); 005067 assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); 005068 assert( pCur->iPage>=0 ); 005069 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 005070 return SQLITE_CORRUPT_BKPT; 005071 } 005072 pCur->info.nSize = 0; 005073 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005074 pCur->aiIdx[pCur->iPage] = pCur->ix; 005075 pCur->apPage[pCur->iPage] = pCur->pPage; 005076 pCur->ix = 0; 005077 pCur->iPage++; 005078 return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags); 005079 } 005080 005081 #ifdef SQLITE_DEBUG 005082 /* 005083 ** Page pParent is an internal (non-leaf) tree page. This function 005084 ** asserts that page number iChild is the left-child if the iIdx'th 005085 ** cell in page pParent. Or, if iIdx is equal to the total number of 005086 ** cells in pParent, that page number iChild is the right-child of 005087 ** the page. 005088 */ 005089 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ 005090 if( CORRUPT_DB ) return; /* The conditions tested below might not be true 005091 ** in a corrupt database */ 005092 assert( iIdx<=pParent->nCell ); 005093 if( iIdx==pParent->nCell ){ 005094 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); 005095 }else{ 005096 assert( get4byte(findCell(pParent, iIdx))==iChild ); 005097 } 005098 } 005099 #else 005100 # define assertParentIndex(x,y,z) 005101 #endif 005102 005103 /* 005104 ** Move the cursor up to the parent page. 
005105 ** 005106 ** pCur->idx is set to the cell index that contains the pointer 005107 ** to the page we are coming from. If we are coming from the 005108 ** right-most child page then pCur->idx is set to one more than 005109 ** the largest cell index. 005110 */ 005111 static void moveToParent(BtCursor *pCur){ 005112 MemPage *pLeaf; 005113 assert( cursorOwnsBtShared(pCur) ); 005114 assert( pCur->eState==CURSOR_VALID ); 005115 assert( pCur->iPage>0 ); 005116 assert( pCur->pPage ); 005117 assertParentIndex( 005118 pCur->apPage[pCur->iPage-1], 005119 pCur->aiIdx[pCur->iPage-1], 005120 pCur->pPage->pgno 005121 ); 005122 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell ); 005123 pCur->info.nSize = 0; 005124 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005125 pCur->ix = pCur->aiIdx[pCur->iPage-1]; 005126 pLeaf = pCur->pPage; 005127 pCur->pPage = pCur->apPage[--pCur->iPage]; 005128 releasePageNotNull(pLeaf); 005129 } 005130 005131 /* 005132 ** Move the cursor to point to the root page of its b-tree structure. 005133 ** 005134 ** If the table has a virtual root page, then the cursor is moved to point 005135 ** to the virtual root page instead of the actual root page. A table has a 005136 ** virtual root page when the actual root page contains no cells and a 005137 ** single child page. This can only happen with the table rooted at page 1. 005138 ** 005139 ** If the b-tree structure is empty, the cursor state is set to 005140 ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise, 005141 ** the cursor is set to point to the first cell located on the root 005142 ** (or virtual root) page and the cursor state is set to CURSOR_VALID. 005143 ** 005144 ** If this function returns successfully, it may be assumed that the 005145 ** page-header flags indicate that the [virtual] root-page is the expected 005146 ** kind of b-tree page (i.e. 
if when opening the cursor the caller did not 005147 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, 005148 ** indicating a table b-tree, or if the caller did specify a KeyInfo 005149 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index 005150 ** b-tree). 005151 */ 005152 static int moveToRoot(BtCursor *pCur){ 005153 MemPage *pRoot; 005154 int rc = SQLITE_OK; 005155 005156 assert( cursorOwnsBtShared(pCur) ); 005157 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 005158 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 005159 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 005160 assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 ); 005161 assert( pCur->pgnoRoot>0 || pCur->iPage<0 ); 005162 005163 if( pCur->iPage>=0 ){ 005164 if( pCur->iPage ){ 005165 releasePageNotNull(pCur->pPage); 005166 while( --pCur->iPage ){ 005167 releasePageNotNull(pCur->apPage[pCur->iPage]); 005168 } 005169 pCur->pPage = pCur->apPage[0]; 005170 goto skip_init; 005171 } 005172 }else if( pCur->pgnoRoot==0 ){ 005173 pCur->eState = CURSOR_INVALID; 005174 return SQLITE_EMPTY; 005175 }else{ 005176 assert( pCur->iPage==(-1) ); 005177 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 005178 if( pCur->eState==CURSOR_FAULT ){ 005179 assert( pCur->skipNext!=SQLITE_OK ); 005180 return pCur->skipNext; 005181 } 005182 sqlite3BtreeClearCursor(pCur); 005183 } 005184 rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage, 005185 0, pCur->curPagerFlags); 005186 if( rc!=SQLITE_OK ){ 005187 pCur->eState = CURSOR_INVALID; 005188 return rc; 005189 } 005190 pCur->iPage = 0; 005191 pCur->curIntKey = pCur->pPage->intKey; 005192 } 005193 pRoot = pCur->pPage; 005194 assert( pRoot->pgno==pCur->pgnoRoot ); 005195 005196 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor 005197 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is 005198 ** NULL, the caller expects a table b-tree. 
If this is not the case, 005199 ** return an SQLITE_CORRUPT error. 005200 ** 005201 ** Earlier versions of SQLite assumed that this test could not fail 005202 ** if the root page was already loaded when this function was called (i.e. 005203 ** if pCur->iPage>=0). But this is not so if the database is corrupted 005204 ** in such a way that page pRoot is linked into a second b-tree table 005205 ** (or the freelist). */ 005206 assert( pRoot->intKey==1 || pRoot->intKey==0 ); 005207 if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){ 005208 return SQLITE_CORRUPT_PAGE(pCur->pPage); 005209 } 005210 005211 skip_init: 005212 pCur->ix = 0; 005213 pCur->info.nSize = 0; 005214 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl); 005215 005216 pRoot = pCur->pPage; 005217 if( pRoot->nCell>0 ){ 005218 pCur->eState = CURSOR_VALID; 005219 }else if( !pRoot->leaf ){ 005220 Pgno subpage; 005221 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; 005222 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 005223 pCur->eState = CURSOR_VALID; 005224 rc = moveToChild(pCur, subpage); 005225 }else{ 005226 pCur->eState = CURSOR_INVALID; 005227 rc = SQLITE_EMPTY; 005228 } 005229 return rc; 005230 } 005231 005232 /* 005233 ** Move the cursor down to the left-most leaf entry beneath the 005234 ** entry to which it is currently pointing. 005235 ** 005236 ** The left-most leaf is the one with the smallest key - the first 005237 ** in ascending order. 
005238 */ 005239 static int moveToLeftmost(BtCursor *pCur){ 005240 Pgno pgno; 005241 int rc = SQLITE_OK; 005242 MemPage *pPage; 005243 005244 assert( cursorOwnsBtShared(pCur) ); 005245 assert( pCur->eState==CURSOR_VALID ); 005246 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 005247 assert( pCur->ix<pPage->nCell ); 005248 pgno = get4byte(findCell(pPage, pCur->ix)); 005249 rc = moveToChild(pCur, pgno); 005250 } 005251 return rc; 005252 } 005253 005254 /* 005255 ** Move the cursor down to the right-most leaf entry beneath the 005256 ** page to which it is currently pointing. Notice the difference 005257 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 005258 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 005259 ** finds the right-most entry beneath the *page*. 005260 ** 005261 ** The right-most entry is the one with the largest key - the last 005262 ** key in ascending order. 005263 */ 005264 static int moveToRightmost(BtCursor *pCur){ 005265 Pgno pgno; 005266 int rc = SQLITE_OK; 005267 MemPage *pPage = 0; 005268 005269 assert( cursorOwnsBtShared(pCur) ); 005270 assert( pCur->eState==CURSOR_VALID ); 005271 while( !(pPage = pCur->pPage)->leaf ){ 005272 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 005273 pCur->ix = pPage->nCell; 005274 rc = moveToChild(pCur, pgno); 005275 if( rc ) return rc; 005276 } 005277 pCur->ix = pPage->nCell-1; 005278 assert( pCur->info.nSize==0 ); 005279 assert( (pCur->curFlags & BTCF_ValidNKey)==0 ); 005280 return SQLITE_OK; 005281 } 005282 005283 /* Move the cursor to the first entry in the table. Return SQLITE_OK 005284 ** on success. Set *pRes to 0 if the cursor actually points to something 005285 ** or set *pRes to 1 if the table is empty. 
005286 */ 005287 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 005288 int rc; 005289 005290 assert( cursorOwnsBtShared(pCur) ); 005291 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005292 rc = moveToRoot(pCur); 005293 if( rc==SQLITE_OK ){ 005294 assert( pCur->pPage->nCell>0 ); 005295 *pRes = 0; 005296 rc = moveToLeftmost(pCur); 005297 }else if( rc==SQLITE_EMPTY ){ 005298 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005299 *pRes = 1; 005300 rc = SQLITE_OK; 005301 } 005302 return rc; 005303 } 005304 005305 /* Move the cursor to the last entry in the table. Return SQLITE_OK 005306 ** on success. Set *pRes to 0 if the cursor actually points to something 005307 ** or set *pRes to 1 if the table is empty. 005308 */ 005309 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 005310 int rc; 005311 005312 assert( cursorOwnsBtShared(pCur) ); 005313 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005314 005315 /* If the cursor already points to the last entry, this is a no-op. */ 005316 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){ 005317 #ifdef SQLITE_DEBUG 005318 /* This block serves to assert() that the cursor really does point 005319 ** to the last entry in the b-tree. 
*/ 005320 int ii; 005321 for(ii=0; ii<pCur->iPage; ii++){ 005322 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell ); 005323 } 005324 assert( pCur->ix==pCur->pPage->nCell-1 ); 005325 assert( pCur->pPage->leaf ); 005326 #endif 005327 *pRes = 0; 005328 return SQLITE_OK; 005329 } 005330 005331 rc = moveToRoot(pCur); 005332 if( rc==SQLITE_OK ){ 005333 assert( pCur->eState==CURSOR_VALID ); 005334 *pRes = 0; 005335 rc = moveToRightmost(pCur); 005336 if( rc==SQLITE_OK ){ 005337 pCur->curFlags |= BTCF_AtLast; 005338 }else{ 005339 pCur->curFlags &= ~BTCF_AtLast; 005340 } 005341 }else if( rc==SQLITE_EMPTY ){ 005342 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005343 *pRes = 1; 005344 rc = SQLITE_OK; 005345 } 005346 return rc; 005347 } 005348 005349 /* Move the cursor so that it points to an entry near the key 005350 ** specified by pIdxKey or intKey. Return a success code. 005351 ** 005352 ** For INTKEY tables, the intKey parameter is used. pIdxKey 005353 ** must be NULL. For index tables, pIdxKey is used and intKey 005354 ** is ignored. 005355 ** 005356 ** If an exact match is not found, then the cursor is always 005357 ** left pointing at a leaf page which would hold the entry if it 005358 ** were present. The cursor might point to an entry that comes 005359 ** before or after the key. 005360 ** 005361 ** An integer is written into *pRes which is the result of 005362 ** comparing the key with the entry to which the cursor is 005363 ** pointing. The meaning of the integer written into 005364 ** *pRes is as follows: 005365 ** 005366 ** *pRes<0 The cursor is left pointing at an entry that 005367 ** is smaller than intKey/pIdxKey or if the table is empty 005368 ** and the cursor is therefore left point to nothing. 005369 ** 005370 ** *pRes==0 The cursor is left pointing at an entry that 005371 ** exactly matches intKey/pIdxKey. 005372 ** 005373 ** *pRes>0 The cursor is left pointing at an entry that 005374 ** is larger than intKey/pIdxKey. 
005375 ** 005376 ** For index tables, the pIdxKey->eqSeen field is set to 1 if there 005377 ** exists an entry in the table that exactly matches pIdxKey. 005378 */ 005379 int sqlite3BtreeMovetoUnpacked( 005380 BtCursor *pCur, /* The cursor to be moved */ 005381 UnpackedRecord *pIdxKey, /* Unpacked index key */ 005382 i64 intKey, /* The table key */ 005383 int biasRight, /* If true, bias the search to the high end */ 005384 int *pRes /* Write search results here */ 005385 ){ 005386 int rc; 005387 RecordCompare xRecordCompare; 005388 005389 assert( cursorOwnsBtShared(pCur) ); 005390 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005391 assert( pRes ); 005392 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) ); 005393 assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) ); 005394 005395 /* If the cursor is already positioned at the point we are trying 005396 ** to move to, then just return without doing any work */ 005397 if( pIdxKey==0 005398 && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 005399 ){ 005400 if( pCur->info.nKey==intKey ){ 005401 *pRes = 0; 005402 return SQLITE_OK; 005403 } 005404 if( pCur->info.nKey<intKey ){ 005405 if( (pCur->curFlags & BTCF_AtLast)!=0 ){ 005406 *pRes = -1; 005407 return SQLITE_OK; 005408 } 005409 /* If the requested key is one more than the previous key, then 005410 ** try to get there using sqlite3BtreeNext() rather than a full 005411 ** binary search. This is an optimization only. 
The correct answer 005412 ** is still obtained without this case, only a little more slowely */ 005413 if( pCur->info.nKey+1==intKey ){ 005414 *pRes = 0; 005415 rc = sqlite3BtreeNext(pCur, 0); 005416 if( rc==SQLITE_OK ){ 005417 getCellInfo(pCur); 005418 if( pCur->info.nKey==intKey ){ 005419 return SQLITE_OK; 005420 } 005421 }else if( rc==SQLITE_DONE ){ 005422 rc = SQLITE_OK; 005423 }else{ 005424 return rc; 005425 } 005426 } 005427 } 005428 } 005429 005430 if( pIdxKey ){ 005431 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey); 005432 pIdxKey->errCode = 0; 005433 assert( pIdxKey->default_rc==1 005434 || pIdxKey->default_rc==0 005435 || pIdxKey->default_rc==-1 005436 ); 005437 }else{ 005438 xRecordCompare = 0; /* All keys are integers */ 005439 } 005440 005441 rc = moveToRoot(pCur); 005442 if( rc ){ 005443 if( rc==SQLITE_EMPTY ){ 005444 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005445 *pRes = -1; 005446 return SQLITE_OK; 005447 } 005448 return rc; 005449 } 005450 assert( pCur->pPage ); 005451 assert( pCur->pPage->isInit ); 005452 assert( pCur->eState==CURSOR_VALID ); 005453 assert( pCur->pPage->nCell > 0 ); 005454 assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey ); 005455 assert( pCur->curIntKey || pIdxKey ); 005456 for(;;){ 005457 int lwr, upr, idx, c; 005458 Pgno chldPg; 005459 MemPage *pPage = pCur->pPage; 005460 u8 *pCell; /* Pointer to current cell in pPage */ 005461 005462 /* pPage->nCell must be greater than zero. If this is the root-page 005463 ** the cursor would have been INVALID above and this for(;;) loop 005464 ** not run. If this is not the root-page, then the moveToChild() routine 005465 ** would have already detected db corruption. Similarly, pPage must 005466 ** be the right kind (index or table) of b-tree page. Otherwise 005467 ** a moveToChild() or moveToRoot() call would have detected corruption. 
*/ 005468 assert( pPage->nCell>0 ); 005469 assert( pPage->intKey==(pIdxKey==0) ); 005470 lwr = 0; 005471 upr = pPage->nCell-1; 005472 assert( biasRight==0 || biasRight==1 ); 005473 idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */ 005474 pCur->ix = (u16)idx; 005475 if( xRecordCompare==0 ){ 005476 for(;;){ 005477 i64 nCellKey; 005478 pCell = findCellPastPtr(pPage, idx); 005479 if( pPage->intKeyLeaf ){ 005480 while( 0x80 <= *(pCell++) ){ 005481 if( pCell>=pPage->aDataEnd ){ 005482 return SQLITE_CORRUPT_PAGE(pPage); 005483 } 005484 } 005485 } 005486 getVarint(pCell, (u64*)&nCellKey); 005487 if( nCellKey<intKey ){ 005488 lwr = idx+1; 005489 if( lwr>upr ){ c = -1; break; } 005490 }else if( nCellKey>intKey ){ 005491 upr = idx-1; 005492 if( lwr>upr ){ c = +1; break; } 005493 }else{ 005494 assert( nCellKey==intKey ); 005495 pCur->ix = (u16)idx; 005496 if( !pPage->leaf ){ 005497 lwr = idx; 005498 goto moveto_next_layer; 005499 }else{ 005500 pCur->curFlags |= BTCF_ValidNKey; 005501 pCur->info.nKey = nCellKey; 005502 pCur->info.nSize = 0; 005503 *pRes = 0; 005504 return SQLITE_OK; 005505 } 005506 } 005507 assert( lwr+upr>=0 ); 005508 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */ 005509 } 005510 }else{ 005511 for(;;){ 005512 int nCell; /* Size of the pCell cell in bytes */ 005513 pCell = findCellPastPtr(pPage, idx); 005514 005515 /* The maximum supported page-size is 65536 bytes. This means that 005516 ** the maximum number of record bytes stored on an index B-Tree 005517 ** page is less than 16384 bytes and may be stored as a 2-byte 005518 ** varint. This information is used to attempt to avoid parsing 005519 ** the entire cell by checking for the cases where the record is 005520 ** stored entirely within the b-tree page by inspecting the first 005521 ** 2 bytes of the cell. 
005522 */ 005523 nCell = pCell[0]; 005524 if( nCell<=pPage->max1bytePayload ){ 005525 /* This branch runs if the record-size field of the cell is a 005526 ** single byte varint and the record fits entirely on the main 005527 ** b-tree page. */ 005528 testcase( pCell+nCell+1==pPage->aDataEnd ); 005529 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 005530 }else if( !(pCell[1] & 0x80) 005531 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 005532 ){ 005533 /* The record-size field is a 2 byte varint and the record 005534 ** fits entirely on the main b-tree page. */ 005535 testcase( pCell+nCell+2==pPage->aDataEnd ); 005536 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 005537 }else{ 005538 /* The record flows over onto one or more overflow pages. In 005539 ** this case the whole cell needs to be parsed, a buffer allocated 005540 ** and accessPayload() used to retrieve the record into the 005541 ** buffer before VdbeRecordCompare() can be called. 005542 ** 005543 ** If the record is corrupt, the xRecordCompare routine may read 005544 ** up to two varints past the end of the buffer. An extra 18 005545 ** bytes of padding is allocated at the end of the buffer in 005546 ** case this happens. 
*/ 005547 void *pCellKey; 005548 u8 * const pCellBody = pCell - pPage->childPtrSize; 005549 const int nOverrun = 18; /* Size of the overrun padding */ 005550 pPage->xParseCell(pPage, pCellBody, &pCur->info); 005551 nCell = (int)pCur->info.nKey; 005552 testcase( nCell<0 ); /* True if key size is 2^32 or more */ 005553 testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */ 005554 testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */ 005555 testcase( nCell==2 ); /* Minimum legal index key size */ 005556 if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){ 005557 rc = SQLITE_CORRUPT_PAGE(pPage); 005558 goto moveto_finish; 005559 } 005560 pCellKey = sqlite3Malloc( nCell+nOverrun ); 005561 if( pCellKey==0 ){ 005562 rc = SQLITE_NOMEM_BKPT; 005563 goto moveto_finish; 005564 } 005565 pCur->ix = (u16)idx; 005566 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); 005567 memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */ 005568 pCur->curFlags &= ~BTCF_ValidOvfl; 005569 if( rc ){ 005570 sqlite3_free(pCellKey); 005571 goto moveto_finish; 005572 } 005573 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); 005574 sqlite3_free(pCellKey); 005575 } 005576 assert( 005577 (pIdxKey->errCode!=SQLITE_CORRUPT || c==0) 005578 && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed) 005579 ); 005580 if( c<0 ){ 005581 lwr = idx+1; 005582 }else if( c>0 ){ 005583 upr = idx-1; 005584 }else{ 005585 assert( c==0 ); 005586 *pRes = 0; 005587 rc = SQLITE_OK; 005588 pCur->ix = (u16)idx; 005589 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT; 005590 goto moveto_finish; 005591 } 005592 if( lwr>upr ) break; 005593 assert( lwr+upr>=0 ); 005594 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */ 005595 } 005596 } 005597 assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) ); 005598 assert( pPage->isInit ); 005599 if( pPage->leaf ){ 005600 assert( pCur->ix<pCur->pPage->nCell ); 005601 pCur->ix = (u16)idx; 005602 *pRes = c; 005603 rc = 
SQLITE_OK; 005604 goto moveto_finish; 005605 } 005606 moveto_next_layer: 005607 if( lwr>=pPage->nCell ){ 005608 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 005609 }else{ 005610 chldPg = get4byte(findCell(pPage, lwr)); 005611 } 005612 pCur->ix = (u16)lwr; 005613 rc = moveToChild(pCur, chldPg); 005614 if( rc ) break; 005615 } 005616 moveto_finish: 005617 pCur->info.nSize = 0; 005618 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 005619 return rc; 005620 } 005621 005622 005623 /* 005624 ** Return TRUE if the cursor is not pointing at an entry of the table. 005625 ** 005626 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 005627 ** past the last entry in the table or sqlite3BtreePrev() moves past 005628 ** the first entry. TRUE is also returned if the table is empty. 005629 */ 005630 int sqlite3BtreeEof(BtCursor *pCur){ 005631 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 005632 ** have been deleted? This API will need to change to return an error code 005633 ** as well as the boolean result value. 005634 */ 005635 return (CURSOR_VALID!=pCur->eState); 005636 } 005637 005638 /* 005639 ** Return an estimate for the number of rows in the table that pCur is 005640 ** pointing to. Return a negative number if no estimate is currently 005641 ** available. 005642 */ 005643 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){ 005644 i64 n; 005645 u8 i; 005646 005647 assert( cursorOwnsBtShared(pCur) ); 005648 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005649 005650 /* Currently this interface is only called by the OP_IfSmaller 005651 ** opcode, and it that case the cursor will always be valid and 005652 ** will always point to a leaf node. 
*/ 005653 if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1; 005654 if( NEVER(pCur->pPage->leaf==0) ) return -1; 005655 005656 n = pCur->pPage->nCell; 005657 for(i=0; i<pCur->iPage; i++){ 005658 n *= pCur->apPage[i]->nCell; 005659 } 005660 return n; 005661 } 005662 005663 /* 005664 ** Advance the cursor to the next entry in the database. 005665 ** Return value: 005666 ** 005667 ** SQLITE_OK success 005668 ** SQLITE_DONE cursor is already pointing at the last element 005669 ** otherwise some kind of error occurred 005670 ** 005671 ** The main entry point is sqlite3BtreeNext(). That routine is optimized 005672 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx 005673 ** to the next cell on the current page. The (slower) btreeNext() helper 005674 ** routine is called when it is necessary to move to a different page or 005675 ** to restore the cursor. 005676 ** 005677 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the 005678 ** cursor corresponds to an SQL index and this routine could have been 005679 ** skipped if the SQL index had been a unique index. The F argument 005680 ** is a hint to the implement. SQLite btree implementation does not use 005681 ** this hint, but COMDB2 does. 
005682 */ 005683 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){ 005684 int rc; 005685 int idx; 005686 MemPage *pPage; 005687 005688 assert( cursorOwnsBtShared(pCur) ); 005689 if( pCur->eState!=CURSOR_VALID ){ 005690 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 005691 rc = restoreCursorPosition(pCur); 005692 if( rc!=SQLITE_OK ){ 005693 return rc; 005694 } 005695 if( CURSOR_INVALID==pCur->eState ){ 005696 return SQLITE_DONE; 005697 } 005698 if( pCur->eState==CURSOR_SKIPNEXT ){ 005699 pCur->eState = CURSOR_VALID; 005700 if( pCur->skipNext>0 ) return SQLITE_OK; 005701 } 005702 } 005703 005704 pPage = pCur->pPage; 005705 idx = ++pCur->ix; 005706 if( !pPage->isInit ){ 005707 /* The only known way for this to happen is for there to be a 005708 ** recursive SQL function that does a DELETE operation as part of a 005709 ** SELECT which deletes content out from under an active cursor 005710 ** in a corrupt database file where the table being DELETE-ed from 005711 ** has pages in common with the table being queried. See TH3 005712 ** module cov1/btree78.test testcase 220 (2018-06-08) for an 005713 ** example. */ 005714 return SQLITE_CORRUPT_BKPT; 005715 } 005716 005717 /* If the database file is corrupt, it is possible for the value of idx 005718 ** to be invalid here. This can only occur if a second cursor modifies 005719 ** the page while cursor pCur is holding a reference to it. Which can 005720 ** only happen if the database is corrupt in such a way as to link the 005721 ** page into more than one b-tree structure. 005722 ** 005723 ** Update 2019-12-23: appears to long longer be possible after the 005724 ** addition of anotherValidCursor() condition on balance_deeper(). 
*/ 005725 harmless( idx>pPage->nCell ); 005726 005727 if( idx>=pPage->nCell ){ 005728 if( !pPage->leaf ){ 005729 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 005730 if( rc ) return rc; 005731 return moveToLeftmost(pCur); 005732 } 005733 do{ 005734 if( pCur->iPage==0 ){ 005735 pCur->eState = CURSOR_INVALID; 005736 return SQLITE_DONE; 005737 } 005738 moveToParent(pCur); 005739 pPage = pCur->pPage; 005740 }while( pCur->ix>=pPage->nCell ); 005741 if( pPage->intKey ){ 005742 return sqlite3BtreeNext(pCur, 0); 005743 }else{ 005744 return SQLITE_OK; 005745 } 005746 } 005747 if( pPage->leaf ){ 005748 return SQLITE_OK; 005749 }else{ 005750 return moveToLeftmost(pCur); 005751 } 005752 } 005753 int sqlite3BtreeNext(BtCursor *pCur, int flags){ 005754 MemPage *pPage; 005755 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */ 005756 assert( cursorOwnsBtShared(pCur) ); 005757 assert( flags==0 || flags==1 ); 005758 pCur->info.nSize = 0; 005759 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005760 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur); 005761 pPage = pCur->pPage; 005762 if( (++pCur->ix)>=pPage->nCell ){ 005763 pCur->ix--; 005764 return btreeNext(pCur); 005765 } 005766 if( pPage->leaf ){ 005767 return SQLITE_OK; 005768 }else{ 005769 return moveToLeftmost(pCur); 005770 } 005771 } 005772 005773 /* 005774 ** Step the cursor to the back to the previous entry in the database. 005775 ** Return values: 005776 ** 005777 ** SQLITE_OK success 005778 ** SQLITE_DONE the cursor is already on the first element of the table 005779 ** otherwise some kind of error occurred 005780 ** 005781 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized 005782 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx 005783 ** to the previous cell on the current page. 
The (slower) btreePrevious() 005784 ** helper routine is called when it is necessary to move to a different page 005785 ** or to restore the cursor. 005786 ** 005787 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then 005788 ** the cursor corresponds to an SQL index and this routine could have been 005789 ** skipped if the SQL index had been a unique index. The F argument is a 005790 ** hint to the implement. The native SQLite btree implementation does not 005791 ** use this hint, but COMDB2 does. 005792 */ 005793 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){ 005794 int rc; 005795 MemPage *pPage; 005796 005797 assert( cursorOwnsBtShared(pCur) ); 005798 assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 ); 005799 assert( pCur->info.nSize==0 ); 005800 if( pCur->eState!=CURSOR_VALID ){ 005801 rc = restoreCursorPosition(pCur); 005802 if( rc!=SQLITE_OK ){ 005803 return rc; 005804 } 005805 if( CURSOR_INVALID==pCur->eState ){ 005806 return SQLITE_DONE; 005807 } 005808 if( CURSOR_SKIPNEXT==pCur->eState ){ 005809 pCur->eState = CURSOR_VALID; 005810 if( pCur->skipNext<0 ) return SQLITE_OK; 005811 } 005812 } 005813 005814 pPage = pCur->pPage; 005815 assert( pPage->isInit ); 005816 if( !pPage->leaf ){ 005817 int idx = pCur->ix; 005818 rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); 005819 if( rc ) return rc; 005820 rc = moveToRightmost(pCur); 005821 }else{ 005822 while( pCur->ix==0 ){ 005823 if( pCur->iPage==0 ){ 005824 pCur->eState = CURSOR_INVALID; 005825 return SQLITE_DONE; 005826 } 005827 moveToParent(pCur); 005828 } 005829 assert( pCur->info.nSize==0 ); 005830 assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 ); 005831 005832 pCur->ix--; 005833 pPage = pCur->pPage; 005834 if( pPage->intKey && !pPage->leaf ){ 005835 rc = sqlite3BtreePrevious(pCur, 0); 005836 }else{ 005837 rc = SQLITE_OK; 005838 } 005839 } 005840 return rc; 005841 } 005842 int sqlite3BtreePrevious(BtCursor *pCur, int flags){ 005843 assert( 
cursorOwnsBtShared(pCur) ); 005844 assert( flags==0 || flags==1 ); 005845 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */ 005846 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey); 005847 pCur->info.nSize = 0; 005848 if( pCur->eState!=CURSOR_VALID 005849 || pCur->ix==0 005850 || pCur->pPage->leaf==0 005851 ){ 005852 return btreePrevious(pCur); 005853 } 005854 pCur->ix--; 005855 return SQLITE_OK; 005856 } 005857 005858 /* 005859 ** Allocate a new page from the database file. 005860 ** 005861 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 005862 ** has already been called on the new page.) The new page has also 005863 ** been referenced and the calling routine is responsible for calling 005864 ** sqlite3PagerUnref() on the new page when it is done. 005865 ** 005866 ** SQLITE_OK is returned on success. Any other return value indicates 005867 ** an error. *ppPage is set to NULL in the event of an error. 005868 ** 005869 ** If the "nearby" parameter is not 0, then an effort is made to 005870 ** locate a page close to the page number "nearby". This can be used in an 005871 ** attempt to keep related pages close to each other in the database file, 005872 ** which in turn can make database access faster. 005873 ** 005874 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists 005875 ** anywhere on the free-list, then it is guaranteed to be returned. If 005876 ** eMode is BTALLOC_LT then the page returned will be less than or equal 005877 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there 005878 ** are no restrictions on which page is returned. 
005879 */ 005880 static int allocateBtreePage( 005881 BtShared *pBt, /* The btree */ 005882 MemPage **ppPage, /* Store pointer to the allocated page here */ 005883 Pgno *pPgno, /* Store the page number here */ 005884 Pgno nearby, /* Search for a page near this one */ 005885 u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */ 005886 ){ 005887 MemPage *pPage1; 005888 int rc; 005889 u32 n; /* Number of pages on the freelist */ 005890 u32 k; /* Number of leaves on the trunk of the freelist */ 005891 MemPage *pTrunk = 0; 005892 MemPage *pPrevTrunk = 0; 005893 Pgno mxPage; /* Total size of the database file */ 005894 005895 assert( sqlite3_mutex_held(pBt->mutex) ); 005896 assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) ); 005897 pPage1 = pBt->pPage1; 005898 mxPage = btreePagecount(pBt); 005899 /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36 005900 ** stores stores the total number of pages on the freelist. */ 005901 n = get4byte(&pPage1->aData[36]); 005902 testcase( n==mxPage-1 ); 005903 if( n>=mxPage ){ 005904 return SQLITE_CORRUPT_BKPT; 005905 } 005906 if( n>0 ){ 005907 /* There are pages on the freelist. Reuse one of those pages. */ 005908 Pgno iTrunk; 005909 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 005910 u32 nSearch = 0; /* Count of the number of search attempts */ 005911 005912 /* If eMode==BTALLOC_EXACT and a query of the pointer-map 005913 ** shows that the page 'nearby' is somewhere on the free-list, then 005914 ** the entire-list will be searched for that page. 
005915 */ 005916 #ifndef SQLITE_OMIT_AUTOVACUUM 005917 if( eMode==BTALLOC_EXACT ){ 005918 if( nearby<=mxPage ){ 005919 u8 eType; 005920 assert( nearby>0 ); 005921 assert( pBt->autoVacuum ); 005922 rc = ptrmapGet(pBt, nearby, &eType, 0); 005923 if( rc ) return rc; 005924 if( eType==PTRMAP_FREEPAGE ){ 005925 searchList = 1; 005926 } 005927 } 005928 }else if( eMode==BTALLOC_LE ){ 005929 searchList = 1; 005930 } 005931 #endif 005932 005933 /* Decrement the free-list count by 1. Set iTrunk to the index of the 005934 ** first free-list trunk page. iPrevTrunk is initially 1. 005935 */ 005936 rc = sqlite3PagerWrite(pPage1->pDbPage); 005937 if( rc ) return rc; 005938 put4byte(&pPage1->aData[36], n-1); 005939 005940 /* The code within this loop is run only once if the 'searchList' variable 005941 ** is not true. Otherwise, it runs once for each trunk-page on the 005942 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT) 005943 ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT) 005944 */ 005945 do { 005946 pPrevTrunk = pTrunk; 005947 if( pPrevTrunk ){ 005948 /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page 005949 ** is the page number of the next freelist trunk page in the list or 005950 ** zero if this is the last freelist trunk page. */ 005951 iTrunk = get4byte(&pPrevTrunk->aData[0]); 005952 }else{ 005953 /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32 005954 ** stores the page number of the first page of the freelist, or zero if 005955 ** the freelist is empty. */ 005956 iTrunk = get4byte(&pPage1->aData[32]); 005957 } 005958 testcase( iTrunk==mxPage ); 005959 if( iTrunk>mxPage || nSearch++ > n ){ 005960 rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? 
pPrevTrunk->pgno : 1); 005961 }else{ 005962 rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0); 005963 } 005964 if( rc ){ 005965 pTrunk = 0; 005966 goto end_allocate_page; 005967 } 005968 assert( pTrunk!=0 ); 005969 assert( pTrunk->aData!=0 ); 005970 /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page 005971 ** is the number of leaf page pointers to follow. */ 005972 k = get4byte(&pTrunk->aData[4]); 005973 if( k==0 && !searchList ){ 005974 /* The trunk has no leaves and the list is not being searched. 005975 ** So extract the trunk page itself and use it as the newly 005976 ** allocated page */ 005977 assert( pPrevTrunk==0 ); 005978 rc = sqlite3PagerWrite(pTrunk->pDbPage); 005979 if( rc ){ 005980 goto end_allocate_page; 005981 } 005982 *pPgno = iTrunk; 005983 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 005984 *ppPage = pTrunk; 005985 pTrunk = 0; 005986 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 005987 }else if( k>(u32)(pBt->usableSize/4 - 2) ){ 005988 /* Value of k is out of range. Database corruption */ 005989 rc = SQLITE_CORRUPT_PGNO(iTrunk); 005990 goto end_allocate_page; 005991 #ifndef SQLITE_OMIT_AUTOVACUUM 005992 }else if( searchList 005993 && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) 005994 ){ 005995 /* The list is being searched and this trunk page is the page 005996 ** to allocate, regardless of whether it has leaves. 
005997 */ 005998 *pPgno = iTrunk; 005999 *ppPage = pTrunk; 006000 searchList = 0; 006001 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006002 if( rc ){ 006003 goto end_allocate_page; 006004 } 006005 if( k==0 ){ 006006 if( !pPrevTrunk ){ 006007 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 006008 }else{ 006009 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 006010 if( rc!=SQLITE_OK ){ 006011 goto end_allocate_page; 006012 } 006013 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 006014 } 006015 }else{ 006016 /* The trunk page is required by the caller but it contains 006017 ** pointers to free-list leaves. The first leaf becomes a trunk 006018 ** page in this case. 006019 */ 006020 MemPage *pNewTrunk; 006021 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 006022 if( iNewTrunk>mxPage ){ 006023 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006024 goto end_allocate_page; 006025 } 006026 testcase( iNewTrunk==mxPage ); 006027 rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0); 006028 if( rc!=SQLITE_OK ){ 006029 goto end_allocate_page; 006030 } 006031 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 006032 if( rc!=SQLITE_OK ){ 006033 releasePage(pNewTrunk); 006034 goto end_allocate_page; 006035 } 006036 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 006037 put4byte(&pNewTrunk->aData[4], k-1); 006038 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 006039 releasePage(pNewTrunk); 006040 if( !pPrevTrunk ){ 006041 assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); 006042 put4byte(&pPage1->aData[32], iNewTrunk); 006043 }else{ 006044 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 006045 if( rc ){ 006046 goto end_allocate_page; 006047 } 006048 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 006049 } 006050 } 006051 pTrunk = 0; 006052 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 006053 #endif 006054 }else if( k>0 ){ 006055 /* Extract a leaf from the trunk */ 006056 u32 closest; 006057 Pgno iPage; 006058 unsigned char *aData = pTrunk->aData; 006059 if( nearby>0 ){ 
006060 u32 i; 006061 closest = 0; 006062 if( eMode==BTALLOC_LE ){ 006063 for(i=0; i<k; i++){ 006064 iPage = get4byte(&aData[8+i*4]); 006065 if( iPage<=nearby ){ 006066 closest = i; 006067 break; 006068 } 006069 } 006070 }else{ 006071 int dist; 006072 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby); 006073 for(i=1; i<k; i++){ 006074 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby); 006075 if( d2<dist ){ 006076 closest = i; 006077 dist = d2; 006078 } 006079 } 006080 } 006081 }else{ 006082 closest = 0; 006083 } 006084 006085 iPage = get4byte(&aData[8+closest*4]); 006086 testcase( iPage==mxPage ); 006087 if( iPage>mxPage ){ 006088 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006089 goto end_allocate_page; 006090 } 006091 testcase( iPage==mxPage ); 006092 if( !searchList 006093 || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) 006094 ){ 006095 int noContent; 006096 *pPgno = iPage; 006097 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" 006098 ": %d more free pages\n", 006099 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 006100 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006101 if( rc ) goto end_allocate_page; 006102 if( closest<k-1 ){ 006103 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 006104 } 006105 put4byte(&aData[4], k-1); 006106 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0; 006107 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent); 006108 if( rc==SQLITE_OK ){ 006109 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 006110 if( rc!=SQLITE_OK ){ 006111 releasePage(*ppPage); 006112 *ppPage = 0; 006113 } 006114 } 006115 searchList = 0; 006116 } 006117 } 006118 releasePage(pPrevTrunk); 006119 pPrevTrunk = 0; 006120 }while( searchList ); 006121 }else{ 006122 /* There are no pages on the freelist, so append a new page to the 006123 ** database image. 006124 ** 006125 ** Normally, new pages allocated by this block can be requested from the 006126 ** pager layer with the 'no-content' flag set. 
This prevents the pager 006127 ** from trying to read the pages content from disk. However, if the 006128 ** current transaction has already run one or more incremental-vacuum 006129 ** steps, then the page we are about to allocate may contain content 006130 ** that is required in the event of a rollback. In this case, do 006131 ** not set the no-content flag. This causes the pager to load and journal 006132 ** the current page content before overwriting it. 006133 ** 006134 ** Note that the pager will not actually attempt to load or journal 006135 ** content for any page that really does lie past the end of the database 006136 ** file on disk. So the effects of disabling the no-content optimization 006137 ** here are confined to those pages that lie between the end of the 006138 ** database image and the end of the database file. 006139 */ 006140 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0; 006141 006142 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 006143 if( rc ) return rc; 006144 pBt->nPage++; 006145 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++; 006146 006147 #ifndef SQLITE_OMIT_AUTOVACUUM 006148 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){ 006149 /* If *pPgno refers to a pointer-map page, allocate two new pages 006150 ** at the end of the file instead of one. The first allocated page 006151 ** becomes a new pointer-map page, the second is used by the caller. 
006152 */ 006153 MemPage *pPg = 0; 006154 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage)); 006155 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); 006156 rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent); 006157 if( rc==SQLITE_OK ){ 006158 rc = sqlite3PagerWrite(pPg->pDbPage); 006159 releasePage(pPg); 006160 } 006161 if( rc ) return rc; 006162 pBt->nPage++; 006163 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; } 006164 } 006165 #endif 006166 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage); 006167 *pPgno = pBt->nPage; 006168 006169 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 006170 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent); 006171 if( rc ) return rc; 006172 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 006173 if( rc!=SQLITE_OK ){ 006174 releasePage(*ppPage); 006175 *ppPage = 0; 006176 } 006177 TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); 006178 } 006179 006180 assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) ); 006181 006182 end_allocate_page: 006183 releasePage(pTrunk); 006184 releasePage(pPrevTrunk); 006185 assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 ); 006186 assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 ); 006187 return rc; 006188 } 006189 006190 /* 006191 ** This function is used to add page iPage to the database file free-list. 006192 ** It is assumed that the page is not already a part of the free-list. 006193 ** 006194 ** The value passed as the second argument to this function is optional. 006195 ** If the caller happens to have a pointer to the MemPage object 006196 ** corresponding to page iPage handy, it may pass it as the second value. 006197 ** Otherwise, it may pass NULL. 006198 ** 006199 ** If a pointer to a MemPage object is passed as the second argument, 006200 ** its reference count is not altered by this function. 
006201 */ 006202 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ 006203 MemPage *pTrunk = 0; /* Free-list trunk page */ 006204 Pgno iTrunk = 0; /* Page number of free-list trunk page */ 006205 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ 006206 MemPage *pPage; /* Page being freed. May be NULL. */ 006207 int rc; /* Return Code */ 006208 u32 nFree; /* Initial number of pages on free-list */ 006209 006210 assert( sqlite3_mutex_held(pBt->mutex) ); 006211 assert( CORRUPT_DB || iPage>1 ); 006212 assert( !pMemPage || pMemPage->pgno==iPage ); 006213 006214 if( iPage<2 || iPage>pBt->nPage ){ 006215 return SQLITE_CORRUPT_BKPT; 006216 } 006217 if( pMemPage ){ 006218 pPage = pMemPage; 006219 sqlite3PagerRef(pPage->pDbPage); 006220 }else{ 006221 pPage = btreePageLookup(pBt, iPage); 006222 } 006223 006224 /* Increment the free page count on pPage1 */ 006225 rc = sqlite3PagerWrite(pPage1->pDbPage); 006226 if( rc ) goto freepage_out; 006227 nFree = get4byte(&pPage1->aData[36]); 006228 put4byte(&pPage1->aData[36], nFree+1); 006229 006230 if( pBt->btsFlags & BTS_SECURE_DELETE ){ 006231 /* If the secure_delete option is enabled, then 006232 ** always fully overwrite deleted information with zeros. 006233 */ 006234 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) ) 006235 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0) 006236 ){ 006237 goto freepage_out; 006238 } 006239 memset(pPage->aData, 0, pPage->pBt->pageSize); 006240 } 006241 006242 /* If the database supports auto-vacuum, write an entry in the pointer-map 006243 ** to indicate that the page is free. 006244 */ 006245 if( ISAUTOVACUUM ){ 006246 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); 006247 if( rc ) goto freepage_out; 006248 } 006249 006250 /* Now manipulate the actual database free-list structure. There are two 006251 ** possibilities. 
If the free-list is currently empty, or if the first 006252 ** trunk page in the free-list is full, then this page will become a 006253 ** new free-list trunk page. Otherwise, it will become a leaf of the 006254 ** first trunk page in the current free-list. This block tests if it 006255 ** is possible to add the page as a new free-list leaf. 006256 */ 006257 if( nFree!=0 ){ 006258 u32 nLeaf; /* Initial number of leaf cells on trunk page */ 006259 006260 iTrunk = get4byte(&pPage1->aData[32]); 006261 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 006262 if( rc!=SQLITE_OK ){ 006263 goto freepage_out; 006264 } 006265 006266 nLeaf = get4byte(&pTrunk->aData[4]); 006267 assert( pBt->usableSize>32 ); 006268 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ 006269 rc = SQLITE_CORRUPT_BKPT; 006270 goto freepage_out; 006271 } 006272 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ 006273 /* In this case there is room on the trunk page to insert the page 006274 ** being freed as a new leaf. 006275 ** 006276 ** Note that the trunk page is not really full until it contains 006277 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 006278 ** coded. But due to a coding error in versions of SQLite prior to 006279 ** 3.6.0, databases with freelist trunk pages holding more than 006280 ** usableSize/4 - 8 entries will be reported as corrupt. In order 006281 ** to maintain backwards compatibility with older versions of SQLite, 006282 ** we will continue to restrict the number of entries to usableSize/4 - 8 006283 ** for now. At some point in the future (once everyone has upgraded 006284 ** to 3.6.0 or later) we should consider fixing the conditional above 006285 ** to read "usableSize/4-2" instead of "usableSize/4-8". 
006286 ** 006287 ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still 006288 ** avoid using the last six entries in the freelist trunk page array in 006289 ** order that database files created by newer versions of SQLite can be 006290 ** read by older versions of SQLite. 006291 */ 006292 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006293 if( rc==SQLITE_OK ){ 006294 put4byte(&pTrunk->aData[4], nLeaf+1); 006295 put4byte(&pTrunk->aData[8+nLeaf*4], iPage); 006296 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){ 006297 sqlite3PagerDontWrite(pPage->pDbPage); 006298 } 006299 rc = btreeSetHasContent(pBt, iPage); 006300 } 006301 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); 006302 goto freepage_out; 006303 } 006304 } 006305 006306 /* If control flows to this point, then it was not possible to add the 006307 ** the page being freed as a leaf page of the first trunk in the free-list. 006308 ** Possibly because the free-list is empty, or possibly because the 006309 ** first trunk in the free-list is full. Either way, the page being freed 006310 ** will become the new first trunk page in the free-list. 
006311 */ 006312 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ 006313 goto freepage_out; 006314 } 006315 rc = sqlite3PagerWrite(pPage->pDbPage); 006316 if( rc!=SQLITE_OK ){ 006317 goto freepage_out; 006318 } 006319 put4byte(pPage->aData, iTrunk); 006320 put4byte(&pPage->aData[4], 0); 006321 put4byte(&pPage1->aData[32], iPage); 006322 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk)); 006323 006324 freepage_out: 006325 if( pPage ){ 006326 pPage->isInit = 0; 006327 } 006328 releasePage(pPage); 006329 releasePage(pTrunk); 006330 return rc; 006331 } 006332 static void freePage(MemPage *pPage, int *pRC){ 006333 if( (*pRC)==SQLITE_OK ){ 006334 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); 006335 } 006336 } 006337 006338 /* 006339 ** Free any overflow pages associated with the given Cell. Store 006340 ** size information about the cell in pInfo. 006341 */ 006342 static int clearCell( 006343 MemPage *pPage, /* The page that contains the Cell */ 006344 unsigned char *pCell, /* First byte of the Cell */ 006345 CellInfo *pInfo /* Size information about the cell */ 006346 ){ 006347 BtShared *pBt; 006348 Pgno ovflPgno; 006349 int rc; 006350 int nOvfl; 006351 u32 ovflPageSize; 006352 006353 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 006354 pPage->xParseCell(pPage, pCell, pInfo); 006355 if( pInfo->nLocal==pInfo->nPayload ){ 006356 return SQLITE_OK; /* No overflow pages. 
Return without doing anything */ 006357 } 006358 testcase( pCell + pInfo->nSize == pPage->aDataEnd ); 006359 testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd ); 006360 if( pCell + pInfo->nSize > pPage->aDataEnd ){ 006361 /* Cell extends past end of page */ 006362 return SQLITE_CORRUPT_PAGE(pPage); 006363 } 006364 ovflPgno = get4byte(pCell + pInfo->nSize - 4); 006365 pBt = pPage->pBt; 006366 assert( pBt->usableSize > 4 ); 006367 ovflPageSize = pBt->usableSize - 4; 006368 nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize; 006369 assert( nOvfl>0 || 006370 (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize) 006371 ); 006372 while( nOvfl-- ){ 006373 Pgno iNext = 0; 006374 MemPage *pOvfl = 0; 006375 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){ 006376 /* 0 is not a legal page number and page 1 cannot be an 006377 ** overflow page. Therefore if ovflPgno<2 or past the end of the 006378 ** file the database must be corrupt. */ 006379 return SQLITE_CORRUPT_BKPT; 006380 } 006381 if( nOvfl ){ 006382 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); 006383 if( rc ) return rc; 006384 } 006385 006386 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) ) 006387 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1 006388 ){ 006389 /* There is no reason any cursor should have an outstanding reference 006390 ** to an overflow page belonging to a cell that is being deleted/updated. 006391 ** So if there exists more than one reference to this page, then it 006392 ** must not really be an overflow page and the database must be corrupt. 006393 ** It is helpful to detect this before calling freePage2(), as 006394 ** freePage2() may zero the page contents if secure-delete mode is 006395 ** enabled. If this 'overflow' page happens to be a page that the 006396 ** caller is iterating through or using in some other way, this 006397 ** can be problematic. 
006398 */ 006399 rc = SQLITE_CORRUPT_BKPT; 006400 }else{ 006401 rc = freePage2(pBt, pOvfl, ovflPgno); 006402 } 006403 006404 if( pOvfl ){ 006405 sqlite3PagerUnref(pOvfl->pDbPage); 006406 } 006407 if( rc ) return rc; 006408 ovflPgno = iNext; 006409 } 006410 return SQLITE_OK; 006411 } 006412 006413 /* 006414 ** Create the byte sequence used to represent a cell on page pPage 006415 ** and write that byte sequence into pCell[]. Overflow pages are 006416 ** allocated and filled in as necessary. The calling procedure 006417 ** is responsible for making sure sufficient space has been allocated 006418 ** for pCell[]. 006419 ** 006420 ** Note that pCell does not necessary need to point to the pPage->aData 006421 ** area. pCell might point to some temporary storage. The cell will 006422 ** be constructed in this temporary area then copied into pPage->aData 006423 ** later. 006424 */ 006425 static int fillInCell( 006426 MemPage *pPage, /* The page that contains the cell */ 006427 unsigned char *pCell, /* Complete text of the cell */ 006428 const BtreePayload *pX, /* Payload with which to construct the cell */ 006429 int *pnSize /* Write cell size here */ 006430 ){ 006431 int nPayload; 006432 const u8 *pSrc; 006433 int nSrc, n, rc, mn; 006434 int spaceLeft; 006435 MemPage *pToRelease; 006436 unsigned char *pPrior; 006437 unsigned char *pPayload; 006438 BtShared *pBt; 006439 Pgno pgnoOvfl; 006440 int nHeader; 006441 006442 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 006443 006444 /* pPage is not necessarily writeable since pCell might be auxiliary 006445 ** buffer space that is separate from the pPage buffer area */ 006446 assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize] 006447 || sqlite3PagerIswriteable(pPage->pDbPage) ); 006448 006449 /* Fill in the header. 
*/ 006450 nHeader = pPage->childPtrSize; 006451 if( pPage->intKey ){ 006452 nPayload = pX->nData + pX->nZero; 006453 pSrc = pX->pData; 006454 nSrc = pX->nData; 006455 assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */ 006456 nHeader += putVarint32(&pCell[nHeader], nPayload); 006457 nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey); 006458 }else{ 006459 assert( pX->nKey<=0x7fffffff && pX->pKey!=0 ); 006460 nSrc = nPayload = (int)pX->nKey; 006461 pSrc = pX->pKey; 006462 nHeader += putVarint32(&pCell[nHeader], nPayload); 006463 } 006464 006465 /* Fill in the payload */ 006466 pPayload = &pCell[nHeader]; 006467 if( nPayload<=pPage->maxLocal ){ 006468 /* This is the common case where everything fits on the btree page 006469 ** and no overflow pages are required. */ 006470 n = nHeader + nPayload; 006471 testcase( n==3 ); 006472 testcase( n==4 ); 006473 if( n<4 ) n = 4; 006474 *pnSize = n; 006475 assert( nSrc<=nPayload ); 006476 testcase( nSrc<nPayload ); 006477 memcpy(pPayload, pSrc, nSrc); 006478 memset(pPayload+nSrc, 0, nPayload-nSrc); 006479 return SQLITE_OK; 006480 } 006481 006482 /* If we reach this point, it means that some of the content will need 006483 ** to spill onto overflow pages. 006484 */ 006485 mn = pPage->minLocal; 006486 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4); 006487 testcase( n==pPage->maxLocal ); 006488 testcase( n==pPage->maxLocal+1 ); 006489 if( n > pPage->maxLocal ) n = mn; 006490 spaceLeft = n; 006491 *pnSize = n + nHeader + 4; 006492 pPrior = &pCell[nHeader+n]; 006493 pToRelease = 0; 006494 pgnoOvfl = 0; 006495 pBt = pPage->pBt; 006496 006497 /* At this point variables should be set as follows: 006498 ** 006499 ** nPayload Total payload size in bytes 006500 ** pPayload Begin writing payload here 006501 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft, 006502 ** that means content must spill into overflow pages. 
006503 ** *pnSize Size of the local cell (not counting overflow pages) 006504 ** pPrior Where to write the pgno of the first overflow page 006505 ** 006506 ** Use a call to btreeParseCellPtr() to verify that the values above 006507 ** were computed correctly. 006508 */ 006509 #ifdef SQLITE_DEBUG 006510 { 006511 CellInfo info; 006512 pPage->xParseCell(pPage, pCell, &info); 006513 assert( nHeader==(int)(info.pPayload - pCell) ); 006514 assert( info.nKey==pX->nKey ); 006515 assert( *pnSize == info.nSize ); 006516 assert( spaceLeft == info.nLocal ); 006517 } 006518 #endif 006519 006520 /* Write the payload into the local Cell and any extra into overflow pages */ 006521 while( 1 ){ 006522 n = nPayload; 006523 if( n>spaceLeft ) n = spaceLeft; 006524 006525 /* If pToRelease is not zero than pPayload points into the data area 006526 ** of pToRelease. Make sure pToRelease is still writeable. */ 006527 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 006528 006529 /* If pPayload is part of the data area of pPage, then make sure pPage 006530 ** is still writeable */ 006531 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize] 006532 || sqlite3PagerIswriteable(pPage->pDbPage) ); 006533 006534 if( nSrc>=n ){ 006535 memcpy(pPayload, pSrc, n); 006536 }else if( nSrc>0 ){ 006537 n = nSrc; 006538 memcpy(pPayload, pSrc, n); 006539 }else{ 006540 memset(pPayload, 0, n); 006541 } 006542 nPayload -= n; 006543 if( nPayload<=0 ) break; 006544 pPayload += n; 006545 pSrc += n; 006546 nSrc -= n; 006547 spaceLeft -= n; 006548 if( spaceLeft==0 ){ 006549 MemPage *pOvfl = 0; 006550 #ifndef SQLITE_OMIT_AUTOVACUUM 006551 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 006552 if( pBt->autoVacuum ){ 006553 do{ 006554 pgnoOvfl++; 006555 } while( 006556 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 006557 ); 006558 } 006559 #endif 006560 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); 006561 #ifndef 
SQLITE_OMIT_AUTOVACUUM 006562 /* If the database supports auto-vacuum, and the second or subsequent 006563 ** overflow page is being allocated, add an entry to the pointer-map 006564 ** for that page now. 006565 ** 006566 ** If this is the first overflow page, then write a partial entry 006567 ** to the pointer-map. If we write nothing to this pointer-map slot, 006568 ** then the optimistic overflow chain processing in clearCell() 006569 ** may misinterpret the uninitialized values and delete the 006570 ** wrong pages from the database. 006571 */ 006572 if( pBt->autoVacuum && rc==SQLITE_OK ){ 006573 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 006574 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc); 006575 if( rc ){ 006576 releasePage(pOvfl); 006577 } 006578 } 006579 #endif 006580 if( rc ){ 006581 releasePage(pToRelease); 006582 return rc; 006583 } 006584 006585 /* If pToRelease is not zero than pPrior points into the data area 006586 ** of pToRelease. Make sure pToRelease is still writeable. */ 006587 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 006588 006589 /* If pPrior is part of the data area of pPage, then make sure pPage 006590 ** is still writeable */ 006591 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize] 006592 || sqlite3PagerIswriteable(pPage->pDbPage) ); 006593 006594 put4byte(pPrior, pgnoOvfl); 006595 releasePage(pToRelease); 006596 pToRelease = pOvfl; 006597 pPrior = pOvfl->aData; 006598 put4byte(pPrior, 0); 006599 pPayload = &pOvfl->aData[4]; 006600 spaceLeft = pBt->usableSize - 4; 006601 } 006602 } 006603 releasePage(pToRelease); 006604 return SQLITE_OK; 006605 } 006606 006607 /* 006608 ** Remove the i-th cell from pPage. This routine effects pPage only. 006609 ** The cell content is not freed or deallocated. It is assumed that 006610 ** the cell content has been copied someplace else. This routine just 006611 ** removes the reference to the cell from pPage. 
006612 ** 006613 ** "sz" must be the number of bytes in the cell. 006614 */ 006615 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ 006616 u32 pc; /* Offset to cell content of cell being deleted */ 006617 u8 *data; /* pPage->aData */ 006618 u8 *ptr; /* Used to move bytes around within data[] */ 006619 int rc; /* The return code */ 006620 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */ 006621 006622 if( *pRC ) return; 006623 assert( idx>=0 && idx<pPage->nCell ); 006624 assert( CORRUPT_DB || sz==cellSize(pPage, idx) ); 006625 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 006626 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 006627 assert( pPage->nFree>=0 ); 006628 data = pPage->aData; 006629 ptr = &pPage->aCellIdx[2*idx]; 006630 pc = get2byte(ptr); 006631 hdr = pPage->hdrOffset; 006632 testcase( pc==get2byte(&data[hdr+5]) ); 006633 testcase( pc+sz==pPage->pBt->usableSize ); 006634 if( pc+sz > pPage->pBt->usableSize ){ 006635 *pRC = SQLITE_CORRUPT_BKPT; 006636 return; 006637 } 006638 rc = freeSpace(pPage, pc, sz); 006639 if( rc ){ 006640 *pRC = rc; 006641 return; 006642 } 006643 pPage->nCell--; 006644 if( pPage->nCell==0 ){ 006645 memset(&data[hdr+1], 0, 4); 006646 data[hdr+7] = 0; 006647 put2byte(&data[hdr+5], pPage->pBt->usableSize); 006648 pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset 006649 - pPage->childPtrSize - 8; 006650 }else{ 006651 memmove(ptr, ptr+2, 2*(pPage->nCell - idx)); 006652 put2byte(&data[hdr+3], pPage->nCell); 006653 pPage->nFree += 2; 006654 } 006655 } 006656 006657 /* 006658 ** Insert a new cell on pPage at cell index "i". pCell points to the 006659 ** content of the cell. 006660 ** 006661 ** If the cell content will fit on the page, then put it there. If it 006662 ** will not fit, then make a copy of the cell content into pTemp if 006663 ** pTemp is not null. 
Regardless of pTemp, allocate a new entry 006664 ** in pPage->apOvfl[] and make it point to the cell content (either 006665 ** in pTemp or the original pCell) and also record its index. 006666 ** Allocating a new entry in pPage->aCell[] implies that 006667 ** pPage->nOverflow is incremented. 006668 ** 006669 ** *pRC must be SQLITE_OK when this routine is called. 006670 */ 006671 static void insertCell( 006672 MemPage *pPage, /* Page into which we are copying */ 006673 int i, /* New cell becomes the i-th cell of the page */ 006674 u8 *pCell, /* Content of the new cell */ 006675 int sz, /* Bytes of content in pCell */ 006676 u8 *pTemp, /* Temp storage space for pCell, if needed */ 006677 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */ 006678 int *pRC /* Read and write return code from here */ 006679 ){ 006680 int idx = 0; /* Where to write new cell content in data[] */ 006681 int j; /* Loop counter */ 006682 u8 *data; /* The content of the whole page */ 006683 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */ 006684 006685 assert( *pRC==SQLITE_OK ); 006686 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 006687 assert( MX_CELL(pPage->pBt)<=10921 ); 006688 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB ); 006689 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) ); 006690 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) ); 006691 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 006692 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB ); 006693 assert( pPage->nFree>=0 ); 006694 if( pPage->nOverflow || sz+2>pPage->nFree ){ 006695 if( pTemp ){ 006696 memcpy(pTemp, pCell, sz); 006697 pCell = pTemp; 006698 } 006699 if( iChild ){ 006700 put4byte(pCell, iChild); 006701 } 006702 j = pPage->nOverflow++; 006703 /* Comparison against ArraySize-1 since we hold back one extra slot 006704 ** as a contingency. In other words, never need more than 3 overflow 006705 ** slots but 4 are allocated, just to be safe. 
*/ 006706 assert( j < ArraySize(pPage->apOvfl)-1 ); 006707 pPage->apOvfl[j] = pCell; 006708 pPage->aiOvfl[j] = (u16)i; 006709 006710 /* When multiple overflows occur, they are always sequential and in 006711 ** sorted order. This invariants arise because multiple overflows can 006712 ** only occur when inserting divider cells into the parent page during 006713 ** balancing, and the dividers are adjacent and sorted. 006714 */ 006715 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */ 006716 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */ 006717 }else{ 006718 int rc = sqlite3PagerWrite(pPage->pDbPage); 006719 if( rc!=SQLITE_OK ){ 006720 *pRC = rc; 006721 return; 006722 } 006723 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 006724 data = pPage->aData; 006725 assert( &data[pPage->cellOffset]==pPage->aCellIdx ); 006726 rc = allocateSpace(pPage, sz, &idx); 006727 if( rc ){ *pRC = rc; return; } 006728 /* The allocateSpace() routine guarantees the following properties 006729 ** if it returns successfully */ 006730 assert( idx >= 0 ); 006731 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB ); 006732 assert( idx+sz <= (int)pPage->pBt->usableSize ); 006733 pPage->nFree -= (u16)(2 + sz); 006734 if( iChild ){ 006735 /* In a corrupt database where an entry in the cell index section of 006736 ** a btree page has a value of 3 or less, the pCell value might point 006737 ** as many as 4 bytes in front of the start of the aData buffer for 006738 ** the source page. 
Make sure this does not cause problems by not 006739 ** reading the first 4 bytes */ 006740 memcpy(&data[idx+4], pCell+4, sz-4); 006741 put4byte(&data[idx], iChild); 006742 }else{ 006743 memcpy(&data[idx], pCell, sz); 006744 } 006745 pIns = pPage->aCellIdx + i*2; 006746 memmove(pIns+2, pIns, 2*(pPage->nCell - i)); 006747 put2byte(pIns, idx); 006748 pPage->nCell++; 006749 /* increment the cell count */ 006750 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++; 006751 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB ); 006752 #ifndef SQLITE_OMIT_AUTOVACUUM 006753 if( pPage->pBt->autoVacuum ){ 006754 /* The cell may contain a pointer to an overflow page. If so, write 006755 ** the entry for the overflow page into the pointer map. 006756 */ 006757 ptrmapPutOvflPtr(pPage, pPage, pCell, pRC); 006758 } 006759 #endif 006760 } 006761 } 006762 006763 /* 006764 ** The following parameters determine how many adjacent pages get involved 006765 ** in a balancing operation. NN is the number of neighbors on either side 006766 ** of the page that participate in the balancing operation. NB is the 006767 ** total number of pages that participate, including the target page and 006768 ** NN neighbors on either side. 006769 ** 006770 ** The minimum value of NN is 1 (of course). Increasing NN above 1 006771 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 006772 ** in exchange for a larger degradation in INSERT and UPDATE performance. 006773 ** The value of NN appears to give the best results overall. 006774 ** 006775 ** (Later:) The description above makes it seem as if these values are 006776 ** tunable - as if you could change them and recompile and it would all work. 006777 ** But that is unlikely. NB has been 3 since the inception of SQLite and 006778 ** we have never tested any other value. 
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB 3             /* (NN*2+1): Total pages involved in the balance */

/*
** A CellArray object contains a cache of pointers and sizes for a
** consecutive sequence of cells that might be held on multiple pages.
**
** The cells in this array are the divider cell or cells from the pParent
** page plus up to three child pages.  There are a total of nCell cells.
**
** pRef is a pointer to one of the pages that contributes cells.  This is
** used to access information such as MemPage.intKey and MemPage.pBt->pageSize
** which should be common to all pages that contribute cells to this array.
**
** apCell[] and szCell[] hold, respectively, pointers to the start of each
** cell and the size of each cell.  Some of the apCell[] pointers might refer
** to overflow cells.  In other words, some apCell[] pointers might not point
** to the content area of the pages.
**
** A szCell[] of zero means the size of that cell has not yet been computed.
**
** The cells come from as many as four different pages:
**
**             -----------
**             | Parent  |
**             -----------
**            /     |     \
**           /      |      \
**  ---------   ---------   ---------
**  |Child-1|   |Child-2|   |Child-3|
**  ---------   ---------   ---------
**
** The order of cells in the array for an index btree is:
**
**       1.  All cells from Child-1 in order
**       2.  The first divider cell from Parent
**       3.  All cells from Child-2 in order
**       4.  The second divider cell from Parent
**       5.  All cells from Child-3 in order
**
** For a table-btree (with rowids) the items 2 and 4 are empty because
** content exists only in leaves and there are no divider cells.
**
** For an index btree, the apEnd[] array holds pointers to the end of page
** for Child-1, the Parent, Child-2, the Parent (again), and Child-3,
** respectively. The ixNx[] array holds the number of cells contained in
** each of these 5 stages, and all stages to the left.  Hence:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 plus 1 for first divider.
**    ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider.
**    ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells
**    ixNx[4] = Total number of cells.
**
** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2]
** are used and they point to the leaf pages only, and the ixNx values are:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 and Child-2.
**    ixNx[2] = Total number of cells.
**
** Sometimes when deleting, a child page can have zero cells.  In those
** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[]
** entries, shift down.  The end result is that each ixNx[] entry should
** be larger than the previous one.
*/
typedef struct CellArray CellArray;
struct CellArray {
  int nCell;              /* Number of cells in apCell[] */
  MemPage *pRef;          /* Reference page */
  u8 **apCell;            /* All cells being balanced */
  u16 *szCell;            /* Local size of all cells in apCell[] */
  u8 *apEnd[NB*2];        /* MemPage.aDataEnd values */
  int ixNx[NB*2];         /* Index at which we move to the next apEnd[] */
};

/*
** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
** computed.
006859 */ 006860 static void populateCellCache(CellArray *p, int idx, int N){ 006861 assert( idx>=0 && idx+N<=p->nCell ); 006862 while( N>0 ){ 006863 assert( p->apCell[idx]!=0 ); 006864 if( p->szCell[idx]==0 ){ 006865 p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]); 006866 }else{ 006867 assert( CORRUPT_DB || 006868 p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) ); 006869 } 006870 idx++; 006871 N--; 006872 } 006873 } 006874 006875 /* 006876 ** Return the size of the Nth element of the cell array 006877 */ 006878 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){ 006879 assert( N>=0 && N<p->nCell ); 006880 assert( p->szCell[N]==0 ); 006881 p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]); 006882 return p->szCell[N]; 006883 } 006884 static u16 cachedCellSize(CellArray *p, int N){ 006885 assert( N>=0 && N<p->nCell ); 006886 if( p->szCell[N] ) return p->szCell[N]; 006887 return computeCellSize(p, N); 006888 } 006889 006890 /* 006891 ** Array apCell[] contains pointers to nCell b-tree page cells. The 006892 ** szCell[] array contains the size in bytes of each cell. This function 006893 ** replaces the current contents of page pPg with the contents of the cell 006894 ** array. 006895 ** 006896 ** Some of the cells in apCell[] may currently be stored in pPg. This 006897 ** function works around problems caused by this by making a copy of any 006898 ** such cells before overwriting the page data. 006899 ** 006900 ** The MemPage.nFree field is invalidated by this function. It is the 006901 ** responsibility of the caller to set it correctly. 
006902 */ 006903 static int rebuildPage( 006904 CellArray *pCArray, /* Content to be added to page pPg */ 006905 int iFirst, /* First cell in pCArray to use */ 006906 int nCell, /* Final number of cells on page */ 006907 MemPage *pPg /* The page to be reconstructed */ 006908 ){ 006909 const int hdr = pPg->hdrOffset; /* Offset of header on pPg */ 006910 u8 * const aData = pPg->aData; /* Pointer to data for pPg */ 006911 const int usableSize = pPg->pBt->usableSize; 006912 u8 * const pEnd = &aData[usableSize]; 006913 int i = iFirst; /* Which cell to copy from pCArray*/ 006914 u32 j; /* Start of cell content area */ 006915 int iEnd = i+nCell; /* Loop terminator */ 006916 u8 *pCellptr = pPg->aCellIdx; 006917 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); 006918 u8 *pData; 006919 int k; /* Current slot in pCArray->apEnd[] */ 006920 u8 *pSrcEnd; /* Current pCArray->apEnd[k] value */ 006921 006922 assert( i<iEnd ); 006923 j = get2byte(&aData[hdr+5]); 006924 if( j>(u32)usableSize ){ j = 0; } 006925 memcpy(&pTmp[j], &aData[j], usableSize - j); 006926 006927 for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){} 006928 pSrcEnd = pCArray->apEnd[k]; 006929 006930 pData = pEnd; 006931 while( 1/*exit by break*/ ){ 006932 u8 *pCell = pCArray->apCell[i]; 006933 u16 sz = pCArray->szCell[i]; 006934 assert( sz>0 ); 006935 if( SQLITE_WITHIN(pCell,aData,pEnd) ){ 006936 if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT; 006937 pCell = &pTmp[pCell - aData]; 006938 }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd 006939 && (uptr)(pCell)<(uptr)pSrcEnd 006940 ){ 006941 return SQLITE_CORRUPT_BKPT; 006942 } 006943 006944 pData -= sz; 006945 put2byte(pCellptr, (pData - aData)); 006946 pCellptr += 2; 006947 if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT; 006948 memcpy(pData, pCell, sz); 006949 assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB ); 006950 testcase( sz!=pPg->xCellSize(pPg,pCell) ); 006951 i++; 006952 if( i>=iEnd ) break; 006953 if( pCArray->ixNx[k]<=i ){ 006954 k++; 
006955 pSrcEnd = pCArray->apEnd[k]; 006956 } 006957 } 006958 006959 /* The pPg->nFree field is now set incorrectly. The caller will fix it. */ 006960 pPg->nCell = nCell; 006961 pPg->nOverflow = 0; 006962 006963 put2byte(&aData[hdr+1], 0); 006964 put2byte(&aData[hdr+3], pPg->nCell); 006965 put2byte(&aData[hdr+5], pData - aData); 006966 aData[hdr+7] = 0x00; 006967 return SQLITE_OK; 006968 } 006969 006970 /* 006971 ** The pCArray objects contains pointers to b-tree cells and the cell sizes. 006972 ** This function attempts to add the cells stored in the array to page pPg. 006973 ** If it cannot (because the page needs to be defragmented before the cells 006974 ** will fit), non-zero is returned. Otherwise, if the cells are added 006975 ** successfully, zero is returned. 006976 ** 006977 ** Argument pCellptr points to the first entry in the cell-pointer array 006978 ** (part of page pPg) to populate. After cell apCell[0] is written to the 006979 ** page body, a 16-bit offset is written to pCellptr. And so on, for each 006980 ** cell in the array. It is the responsibility of the caller to ensure 006981 ** that it is safe to overwrite this part of the cell-pointer array. 006982 ** 006983 ** When this function is called, *ppData points to the start of the 006984 ** content area on page pPg. If the size of the content area is extended, 006985 ** *ppData is updated to point to the new start of the content area 006986 ** before returning. 006987 ** 006988 ** Finally, argument pBegin points to the byte immediately following the 006989 ** end of the space required by this page for the cell-pointer area (for 006990 ** all cells - not just those inserted by the current call). If the content 006991 ** area must be extended to before this point in order to accomodate all 006992 ** cells in apCell[], then the cells do not fit and non-zero is returned. 
006993 */ 006994 static int pageInsertArray( 006995 MemPage *pPg, /* Page to add cells to */ 006996 u8 *pBegin, /* End of cell-pointer array */ 006997 u8 **ppData, /* IN/OUT: Page content-area pointer */ 006998 u8 *pCellptr, /* Pointer to cell-pointer area */ 006999 int iFirst, /* Index of first cell to add */ 007000 int nCell, /* Number of cells to add to pPg */ 007001 CellArray *pCArray /* Array of cells */ 007002 ){ 007003 int i = iFirst; /* Loop counter - cell index to insert */ 007004 u8 *aData = pPg->aData; /* Complete page */ 007005 u8 *pData = *ppData; /* Content area. A subset of aData[] */ 007006 int iEnd = iFirst + nCell; /* End of loop. One past last cell to ins */ 007007 int k; /* Current slot in pCArray->apEnd[] */ 007008 u8 *pEnd; /* Maximum extent of cell data */ 007009 assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */ 007010 if( iEnd<=iFirst ) return 0; 007011 for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){} 007012 pEnd = pCArray->apEnd[k]; 007013 while( 1 /*Exit by break*/ ){ 007014 int sz, rc; 007015 u8 *pSlot; 007016 assert( pCArray->szCell[i]!=0 ); 007017 sz = pCArray->szCell[i]; 007018 if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){ 007019 if( (pData - pBegin)<sz ) return 1; 007020 pData -= sz; 007021 pSlot = pData; 007022 } 007023 /* pSlot and pCArray->apCell[i] will never overlap on a well-formed 007024 ** database. But they might for a corrupt database. 
Hence use memmove() 007025 ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */ 007026 assert( (pSlot+sz)<=pCArray->apCell[i] 007027 || pSlot>=(pCArray->apCell[i]+sz) 007028 || CORRUPT_DB ); 007029 if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd 007030 && (uptr)(pCArray->apCell[i])<(uptr)pEnd 007031 ){ 007032 assert( CORRUPT_DB ); 007033 (void)SQLITE_CORRUPT_BKPT; 007034 return 1; 007035 } 007036 memmove(pSlot, pCArray->apCell[i], sz); 007037 put2byte(pCellptr, (pSlot - aData)); 007038 pCellptr += 2; 007039 i++; 007040 if( i>=iEnd ) break; 007041 if( pCArray->ixNx[k]<=i ){ 007042 k++; 007043 pEnd = pCArray->apEnd[k]; 007044 } 007045 } 007046 *ppData = pData; 007047 return 0; 007048 } 007049 007050 /* 007051 ** The pCArray object contains pointers to b-tree cells and their sizes. 007052 ** 007053 ** This function adds the space associated with each cell in the array 007054 ** that is currently stored within the body of pPg to the pPg free-list. 007055 ** The cell-pointers and other fields of the page are not updated. 007056 ** 007057 ** This function returns the total number of cells added to the free-list. 007058 */ 007059 static int pageFreeArray( 007060 MemPage *pPg, /* Page to edit */ 007061 int iFirst, /* First cell to delete */ 007062 int nCell, /* Cells to delete */ 007063 CellArray *pCArray /* Array of cells */ 007064 ){ 007065 u8 * const aData = pPg->aData; 007066 u8 * const pEnd = &aData[pPg->pBt->usableSize]; 007067 u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize]; 007068 int nRet = 0; 007069 int i; 007070 int iEnd = iFirst + nCell; 007071 u8 *pFree = 0; 007072 int szFree = 0; 007073 007074 for(i=iFirst; i<iEnd; i++){ 007075 u8 *pCell = pCArray->apCell[i]; 007076 if( SQLITE_WITHIN(pCell, pStart, pEnd) ){ 007077 int sz; 007078 /* No need to use cachedCellSize() here. 
The sizes of all cells that 007079 ** are to be freed have already been computing while deciding which 007080 ** cells need freeing */ 007081 sz = pCArray->szCell[i]; assert( sz>0 ); 007082 if( pFree!=(pCell + sz) ){ 007083 if( pFree ){ 007084 assert( pFree>aData && (pFree - aData)<65536 ); 007085 freeSpace(pPg, (u16)(pFree - aData), szFree); 007086 } 007087 pFree = pCell; 007088 szFree = sz; 007089 if( pFree+sz>pEnd ) return 0; 007090 }else{ 007091 pFree = pCell; 007092 szFree += sz; 007093 } 007094 nRet++; 007095 } 007096 } 007097 if( pFree ){ 007098 assert( pFree>aData && (pFree - aData)<65536 ); 007099 freeSpace(pPg, (u16)(pFree - aData), szFree); 007100 } 007101 return nRet; 007102 } 007103 007104 /* 007105 ** pCArray contains pointers to and sizes of all cells in the page being 007106 ** balanced. The current page, pPg, has pPg->nCell cells starting with 007107 ** pCArray->apCell[iOld]. After balancing, this page should hold nNew cells 007108 ** starting at apCell[iNew]. 007109 ** 007110 ** This routine makes the necessary adjustments to pPg so that it contains 007111 ** the correct cells after being balanced. 007112 ** 007113 ** The pPg->nFree field is invalid when this function returns. It is the 007114 ** responsibility of the caller to set it correctly. 
007115 */ 007116 static int editPage( 007117 MemPage *pPg, /* Edit this page */ 007118 int iOld, /* Index of first cell currently on page */ 007119 int iNew, /* Index of new first cell on page */ 007120 int nNew, /* Final number of cells on page */ 007121 CellArray *pCArray /* Array of cells and sizes */ 007122 ){ 007123 u8 * const aData = pPg->aData; 007124 const int hdr = pPg->hdrOffset; 007125 u8 *pBegin = &pPg->aCellIdx[nNew * 2]; 007126 int nCell = pPg->nCell; /* Cells stored on pPg */ 007127 u8 *pData; 007128 u8 *pCellptr; 007129 int i; 007130 int iOldEnd = iOld + pPg->nCell + pPg->nOverflow; 007131 int iNewEnd = iNew + nNew; 007132 007133 #ifdef SQLITE_DEBUG 007134 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); 007135 memcpy(pTmp, aData, pPg->pBt->usableSize); 007136 #endif 007137 007138 /* Remove cells from the start and end of the page */ 007139 assert( nCell>=0 ); 007140 if( iOld<iNew ){ 007141 int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray); 007142 if( nShift>nCell ) return SQLITE_CORRUPT_BKPT; 007143 memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2); 007144 nCell -= nShift; 007145 } 007146 if( iNewEnd < iOldEnd ){ 007147 int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray); 007148 assert( nCell>=nTail ); 007149 nCell -= nTail; 007150 } 007151 007152 pData = &aData[get2byteNotZero(&aData[hdr+5])]; 007153 if( pData<pBegin ) goto editpage_fail; 007154 007155 /* Add cells to the start of the page */ 007156 if( iNew<iOld ){ 007157 int nAdd = MIN(nNew,iOld-iNew); 007158 assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB ); 007159 assert( nAdd>=0 ); 007160 pCellptr = pPg->aCellIdx; 007161 memmove(&pCellptr[nAdd*2], pCellptr, nCell*2); 007162 if( pageInsertArray( 007163 pPg, pBegin, &pData, pCellptr, 007164 iNew, nAdd, pCArray 007165 ) ) goto editpage_fail; 007166 nCell += nAdd; 007167 } 007168 007169 /* Add any overflow cells */ 007170 for(i=0; i<pPg->nOverflow; i++){ 007171 int iCell = (iOld + pPg->aiOvfl[i]) - iNew; 
007172 if( iCell>=0 && iCell<nNew ){ 007173 pCellptr = &pPg->aCellIdx[iCell * 2]; 007174 if( nCell>iCell ){ 007175 memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2); 007176 } 007177 nCell++; 007178 cachedCellSize(pCArray, iCell+iNew); 007179 if( pageInsertArray( 007180 pPg, pBegin, &pData, pCellptr, 007181 iCell+iNew, 1, pCArray 007182 ) ) goto editpage_fail; 007183 } 007184 } 007185 007186 /* Append cells to the end of the page */ 007187 assert( nCell>=0 ); 007188 pCellptr = &pPg->aCellIdx[nCell*2]; 007189 if( pageInsertArray( 007190 pPg, pBegin, &pData, pCellptr, 007191 iNew+nCell, nNew-nCell, pCArray 007192 ) ) goto editpage_fail; 007193 007194 pPg->nCell = nNew; 007195 pPg->nOverflow = 0; 007196 007197 put2byte(&aData[hdr+3], pPg->nCell); 007198 put2byte(&aData[hdr+5], pData - aData); 007199 007200 #ifdef SQLITE_DEBUG 007201 for(i=0; i<nNew && !CORRUPT_DB; i++){ 007202 u8 *pCell = pCArray->apCell[i+iNew]; 007203 int iOff = get2byteAligned(&pPg->aCellIdx[i*2]); 007204 if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){ 007205 pCell = &pTmp[pCell - aData]; 007206 } 007207 assert( 0==memcmp(pCell, &aData[iOff], 007208 pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) ); 007209 } 007210 #endif 007211 007212 return SQLITE_OK; 007213 editpage_fail: 007214 /* Unable to edit this page. Rebuild it from scratch instead. */ 007215 populateCellCache(pCArray, iNew, nNew); 007216 return rebuildPage(pCArray, iNew, nNew, pPg); 007217 } 007218 007219 007220 #ifndef SQLITE_OMIT_QUICKBALANCE 007221 /* 007222 ** This version of balance() handles the common special case where 007223 ** a new entry is being inserted on the extreme right-end of the 007224 ** tree, in other words, when the new entry will become the largest 007225 ** entry in the tree. 007226 ** 007227 ** Instead of trying to balance the 3 right-most leaf pages, just add 007228 ** a new page to the right-hand side and put the one new entry in 007229 ** that page. 
This leaves the right side of the tree somewhat 007230 ** unbalanced. But odds are that we will be inserting new entries 007231 ** at the end soon afterwards so the nearly empty page will quickly 007232 ** fill up. On average. 007233 ** 007234 ** pPage is the leaf page which is the right-most page in the tree. 007235 ** pParent is its parent. pPage must have a single overflow entry 007236 ** which is also the right-most entry on the page. 007237 ** 007238 ** The pSpace buffer is used to store a temporary copy of the divider 007239 ** cell that will be inserted into pParent. Such a cell consists of a 4 007240 ** byte page number followed by a variable length integer. In other 007241 ** words, at most 13 bytes. Hence the pSpace buffer must be at 007242 ** least 13 bytes in size. 007243 */ 007244 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ 007245 BtShared *const pBt = pPage->pBt; /* B-Tree Database */ 007246 MemPage *pNew; /* Newly allocated page */ 007247 int rc; /* Return Code */ 007248 Pgno pgnoNew; /* Page number of pNew */ 007249 007250 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007251 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 007252 assert( pPage->nOverflow==1 ); 007253 007254 if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; /* dbfuzz001.test */ 007255 assert( pPage->nFree>=0 ); 007256 assert( pParent->nFree>=0 ); 007257 007258 /* Allocate a new page. This page will become the right-sibling of 007259 ** pPage. Make the parent page writable, so that the new divider cell 007260 ** may be inserted. If both these operations are successful, proceed. 
007261 */ 007262 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 007263 007264 if( rc==SQLITE_OK ){ 007265 007266 u8 *pOut = &pSpace[4]; 007267 u8 *pCell = pPage->apOvfl[0]; 007268 u16 szCell = pPage->xCellSize(pPage, pCell); 007269 u8 *pStop; 007270 CellArray b; 007271 007272 assert( sqlite3PagerIswriteable(pNew->pDbPage) ); 007273 assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); 007274 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); 007275 b.nCell = 1; 007276 b.pRef = pPage; 007277 b.apCell = &pCell; 007278 b.szCell = &szCell; 007279 b.apEnd[0] = pPage->aDataEnd; 007280 b.ixNx[0] = 2; 007281 rc = rebuildPage(&b, 0, 1, pNew); 007282 if( NEVER(rc) ){ 007283 releasePage(pNew); 007284 return rc; 007285 } 007286 pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; 007287 007288 /* If this is an auto-vacuum database, update the pointer map 007289 ** with entries for the new page, and any pointer from the 007290 ** cell on the page to an overflow page. If either of these 007291 ** operations fails, the return code is set, but the contents 007292 ** of the parent page are still manipulated by thh code below. 007293 ** That is Ok, at this point the parent page is guaranteed to 007294 ** be marked as dirty. Returning an error code will cause a 007295 ** rollback, undoing any changes made to the parent page. 007296 */ 007297 if( ISAUTOVACUUM ){ 007298 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); 007299 if( szCell>pNew->minLocal ){ 007300 ptrmapPutOvflPtr(pNew, pNew, pCell, &rc); 007301 } 007302 } 007303 007304 /* Create a divider cell to insert into pParent. The divider cell 007305 ** consists of a 4-byte page number (the page number of pPage) and 007306 ** a variable length key value (which must be the same value as the 007307 ** largest key on pPage). 007308 ** 007309 ** To find the largest key value on pPage, first find the right-most 007310 ** cell on pPage. 
The first two fields of this cell are the 007311 ** record-length (a variable length integer at most 32-bits in size) 007312 ** and the key value (a variable length integer, may have any value). 007313 ** The first of the while(...) loops below skips over the record-length 007314 ** field. The second while(...) loop copies the key value from the 007315 ** cell on pPage into the pSpace buffer. 007316 */ 007317 pCell = findCell(pPage, pPage->nCell-1); 007318 pStop = &pCell[9]; 007319 while( (*(pCell++)&0x80) && pCell<pStop ); 007320 pStop = &pCell[9]; 007321 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); 007322 007323 /* Insert the new divider cell into pParent. */ 007324 if( rc==SQLITE_OK ){ 007325 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), 007326 0, pPage->pgno, &rc); 007327 } 007328 007329 /* Set the right-child pointer of pParent to point to the new page. */ 007330 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 007331 007332 /* Release the reference to the new page. */ 007333 releasePage(pNew); 007334 } 007335 007336 return rc; 007337 } 007338 #endif /* SQLITE_OMIT_QUICKBALANCE */ 007339 007340 #if 0 007341 /* 007342 ** This function does not contribute anything to the operation of SQLite. 007343 ** it is sometimes activated temporarily while debugging code responsible 007344 ** for setting pointer-map entries. 
007345 */ 007346 static int ptrmapCheckPages(MemPage **apPage, int nPage){ 007347 int i, j; 007348 for(i=0; i<nPage; i++){ 007349 Pgno n; 007350 u8 e; 007351 MemPage *pPage = apPage[i]; 007352 BtShared *pBt = pPage->pBt; 007353 assert( pPage->isInit ); 007354 007355 for(j=0; j<pPage->nCell; j++){ 007356 CellInfo info; 007357 u8 *z; 007358 007359 z = findCell(pPage, j); 007360 pPage->xParseCell(pPage, z, &info); 007361 if( info.nLocal<info.nPayload ){ 007362 Pgno ovfl = get4byte(&z[info.nSize-4]); 007363 ptrmapGet(pBt, ovfl, &e, &n); 007364 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); 007365 } 007366 if( !pPage->leaf ){ 007367 Pgno child = get4byte(z); 007368 ptrmapGet(pBt, child, &e, &n); 007369 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 007370 } 007371 } 007372 if( !pPage->leaf ){ 007373 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); 007374 ptrmapGet(pBt, child, &e, &n); 007375 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 007376 } 007377 } 007378 return 1; 007379 } 007380 #endif 007381 007382 /* 007383 ** This function is used to copy the contents of the b-tree node stored 007384 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then 007385 ** the pointer-map entries for each child page are updated so that the 007386 ** parent page stored in the pointer map is page pTo. If pFrom contained 007387 ** any cells with overflow page pointers, then the corresponding pointer 007388 ** map entries are also updated so that the parent page is page pTo. 007389 ** 007390 ** If pFrom is currently carrying any overflow cells (entries in the 007391 ** MemPage.apOvfl[] array), they are not copied to pTo. 007392 ** 007393 ** Before returning, page pTo is reinitialized using btreeInitPage(). 007394 ** 007395 ** The performance of this function is not critical. It is only used by 007396 ** the balance_shallower() and balance_deeper() procedures, neither of 007397 ** which are called often under normal circumstances. 
007398 */ 007399 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ 007400 if( (*pRC)==SQLITE_OK ){ 007401 BtShared * const pBt = pFrom->pBt; 007402 u8 * const aFrom = pFrom->aData; 007403 u8 * const aTo = pTo->aData; 007404 int const iFromHdr = pFrom->hdrOffset; 007405 int const iToHdr = ((pTo->pgno==1) ? 100 : 0); 007406 int rc; 007407 int iData; 007408 007409 007410 assert( pFrom->isInit ); 007411 assert( pFrom->nFree>=iToHdr ); 007412 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize ); 007413 007414 /* Copy the b-tree node content from page pFrom to page pTo. */ 007415 iData = get2byte(&aFrom[iFromHdr+5]); 007416 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); 007417 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); 007418 007419 /* Reinitialize page pTo so that the contents of the MemPage structure 007420 ** match the new data. The initialization of pTo can actually fail under 007421 ** fairly obscure circumstances, even though it is a copy of initialized 007422 ** page pFrom. 007423 */ 007424 pTo->isInit = 0; 007425 rc = btreeInitPage(pTo); 007426 if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo); 007427 if( rc!=SQLITE_OK ){ 007428 *pRC = rc; 007429 return; 007430 } 007431 007432 /* If this is an auto-vacuum database, update the pointer-map entries 007433 ** for any b-tree or overflow pages that pTo now contains the pointers to. 007434 */ 007435 if( ISAUTOVACUUM ){ 007436 *pRC = setChildPtrmaps(pTo); 007437 } 007438 } 007439 } 007440 007441 /* 007442 ** This routine redistributes cells on the iParentIdx'th child of pParent 007443 ** (hereafter "the page") and up to 2 siblings so that all pages have about the 007444 ** same amount of free space. Usually a single sibling on either side of the 007445 ** page are used in the balancing, though both siblings might come from one 007446 ** side if the page is the first or last child of its parent. 
If the page 007447 ** has fewer than 2 siblings (something which can only happen if the page 007448 ** is a root page or a child of a root page) then all available siblings 007449 ** participate in the balancing. 007450 ** 007451 ** The number of siblings of the page might be increased or decreased by 007452 ** one or two in an effort to keep pages nearly full but not over full. 007453 ** 007454 ** Note that when this routine is called, some of the cells on the page 007455 ** might not actually be stored in MemPage.aData[]. This can happen 007456 ** if the page is overfull. This routine ensures that all cells allocated 007457 ** to the page and its siblings fit into MemPage.aData[] before returning. 007458 ** 007459 ** In the course of balancing the page and its siblings, cells may be 007460 ** inserted into or removed from the parent page (pParent). Doing so 007461 ** may cause the parent page to become overfull or underfull. If this 007462 ** happens, it is the responsibility of the caller to invoke the correct 007463 ** balancing routine to fix this problem (see the balance() routine). 007464 ** 007465 ** If this routine fails for any reason, it might leave the database 007466 ** in a corrupted state. So if this routine fails, the database should 007467 ** be rolled back. 007468 ** 007469 ** The third argument to this function, aOvflSpace, is a pointer to a 007470 ** buffer big enough to hold one page. If while inserting cells into the parent 007471 ** page (pParent) the parent page becomes overfull, this buffer is 007472 ** used to store the parent's overflow cells. Because this function inserts 007473 ** a maximum of four divider cells into the parent page, and the maximum 007474 ** size of a cell stored within an internal node is always less than 1/4 007475 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large 007476 ** enough for all overflow cells. 
007477 ** 007478 ** If aOvflSpace is set to a null pointer, this function returns 007479 ** SQLITE_NOMEM. 007480 */ 007481 static int balance_nonroot( 007482 MemPage *pParent, /* Parent page of siblings being balanced */ 007483 int iParentIdx, /* Index of "the page" in pParent */ 007484 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ 007485 int isRoot, /* True if pParent is a root-page */ 007486 int bBulk /* True if this call is part of a bulk load */ 007487 ){ 007488 BtShared *pBt; /* The whole database */ 007489 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 007490 int nNew = 0; /* Number of pages in apNew[] */ 007491 int nOld; /* Number of pages in apOld[] */ 007492 int i, j, k; /* Loop counters */ 007493 int nxDiv; /* Next divider slot in pParent->aCell[] */ 007494 int rc = SQLITE_OK; /* The return code */ 007495 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 007496 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 007497 int usableSpace; /* Bytes in pPage beyond the header */ 007498 int pageFlags; /* Value of pPage->aData[0] */ 007499 int iSpace1 = 0; /* First unused byte of aSpace1[] */ 007500 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ 007501 int szScratch; /* Size of scratch memory requested */ 007502 MemPage *apOld[NB]; /* pPage and up to two siblings */ 007503 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 007504 u8 *pRight; /* Location in parent of right-sibling pointer */ 007505 u8 *apDiv[NB-1]; /* Divider cells in pParent */ 007506 int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */ 007507 int cntOld[NB+2]; /* Old index in b.apCell[] */ 007508 int szNew[NB+2]; /* Combined size of cells placed on i-th page */ 007509 u8 *aSpace1; /* Space for copies of dividers cells */ 007510 Pgno pgno; /* Temp var to store a page number in */ 007511 u8 abDone[NB+2]; /* True after i'th new page is populated */ 007512 Pgno aPgno[NB+2]; /* Page numbers of new 
pages before shuffling */ 007513 Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */ 007514 u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */ 007515 CellArray b; /* Parsed information on cells being balanced */ 007516 007517 memset(abDone, 0, sizeof(abDone)); 007518 b.nCell = 0; 007519 b.apCell = 0; 007520 pBt = pParent->pBt; 007521 assert( sqlite3_mutex_held(pBt->mutex) ); 007522 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 007523 007524 /* At this point pParent may have at most one overflow cell. And if 007525 ** this overflow cell is present, it must be the cell with 007526 ** index iParentIdx. This scenario comes about when this function 007527 ** is called (indirectly) from sqlite3BtreeDelete(). 007528 */ 007529 assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); 007530 assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx ); 007531 007532 if( !aOvflSpace ){ 007533 return SQLITE_NOMEM_BKPT; 007534 } 007535 assert( pParent->nFree>=0 ); 007536 007537 /* Find the sibling pages to balance. Also locate the cells in pParent 007538 ** that divide the siblings. An attempt is made to find NN siblings on 007539 ** either side of pPage. More siblings are taken from one side, however, 007540 ** if there are fewer than NN siblings on the other side. If pParent 007541 ** has NB or fewer children then all children of pParent are taken. 007542 ** 007543 ** This loop also drops the divider cells from the parent page. This 007544 ** way, the remainder of the function does not have to deal with any 007545 ** overflow cells in the parent page, since if any existed they will 007546 ** have already been removed. 
007547 */ 007548 i = pParent->nOverflow + pParent->nCell; 007549 if( i<2 ){ 007550 nxDiv = 0; 007551 }else{ 007552 assert( bBulk==0 || bBulk==1 ); 007553 if( iParentIdx==0 ){ 007554 nxDiv = 0; 007555 }else if( iParentIdx==i ){ 007556 nxDiv = i-2+bBulk; 007557 }else{ 007558 nxDiv = iParentIdx-1; 007559 } 007560 i = 2-bBulk; 007561 } 007562 nOld = i+1; 007563 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ 007564 pRight = &pParent->aData[pParent->hdrOffset+8]; 007565 }else{ 007566 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); 007567 } 007568 pgno = get4byte(pRight); 007569 while( 1 ){ 007570 rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0); 007571 if( rc ){ 007572 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 007573 goto balance_cleanup; 007574 } 007575 if( apOld[i]->nFree<0 ){ 007576 rc = btreeComputeFreeSpace(apOld[i]); 007577 if( rc ){ 007578 memset(apOld, 0, (i)*sizeof(MemPage*)); 007579 goto balance_cleanup; 007580 } 007581 } 007582 if( (i--)==0 ) break; 007583 007584 if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){ 007585 apDiv[i] = pParent->apOvfl[0]; 007586 pgno = get4byte(apDiv[i]); 007587 szNew[i] = pParent->xCellSize(pParent, apDiv[i]); 007588 pParent->nOverflow = 0; 007589 }else{ 007590 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); 007591 pgno = get4byte(apDiv[i]); 007592 szNew[i] = pParent->xCellSize(pParent, apDiv[i]); 007593 007594 /* Drop the cell from the parent page. apDiv[i] still points to 007595 ** the cell within the parent, even though it has been dropped. 007596 ** This is safe because dropping a cell only overwrites the first 007597 ** four bytes of it, and this function does not need the first 007598 ** four bytes of the divider cell. So the pointer is safe to use 007599 ** later on. 007600 ** 007601 ** But not if we are in secure-delete mode. In secure-delete mode, 007602 ** the dropCell() routine will overwrite the entire cell with zeroes. 
007603 ** In this case, temporarily copy the cell into the aOvflSpace[] 007604 ** buffer. It will be copied out again as soon as the aSpace[] buffer 007605 ** is allocated. */ 007606 if( pBt->btsFlags & BTS_FAST_SECURE ){ 007607 int iOff; 007608 007609 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData); 007610 if( (iOff+szNew[i])>(int)pBt->usableSize ){ 007611 rc = SQLITE_CORRUPT_BKPT; 007612 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 007613 goto balance_cleanup; 007614 }else{ 007615 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]); 007616 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; 007617 } 007618 } 007619 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); 007620 } 007621 } 007622 007623 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte 007624 ** alignment */ 007625 nMaxCells = nOld*(MX_CELL(pBt) + ArraySize(pParent->apOvfl)); 007626 nMaxCells = (nMaxCells + 3)&~3; 007627 007628 /* 007629 ** Allocate space for memory structures 007630 */ 007631 szScratch = 007632 nMaxCells*sizeof(u8*) /* b.apCell */ 007633 + nMaxCells*sizeof(u16) /* b.szCell */ 007634 + pBt->pageSize; /* aSpace1 */ 007635 007636 assert( szScratch<=7*(int)pBt->pageSize ); 007637 b.apCell = sqlite3StackAllocRaw(0, szScratch ); 007638 if( b.apCell==0 ){ 007639 rc = SQLITE_NOMEM_BKPT; 007640 goto balance_cleanup; 007641 } 007642 b.szCell = (u16*)&b.apCell[nMaxCells]; 007643 aSpace1 = (u8*)&b.szCell[nMaxCells]; 007644 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); 007645 007646 /* 007647 ** Load pointers to all cells on sibling pages and the divider cells 007648 ** into the local b.apCell[] array. Make copies of the divider cells 007649 ** into space obtained from aSpace1[]. The divider cells have already 007650 ** been removed from pParent. 007651 ** 007652 ** If the siblings are on leaf pages, then the child pointers of the 007653 ** divider cells are stripped from the cells before they are copied 007654 ** into aSpace1[]. 
In this way, all cells in b.apCell[] are without 007655 ** child pointers. If siblings are not leaves, then all cell in 007656 ** b.apCell[] include child pointers. Either way, all cells in b.apCell[] 007657 ** are alike. 007658 ** 007659 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 007660 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 007661 */ 007662 b.pRef = apOld[0]; 007663 leafCorrection = b.pRef->leaf*4; 007664 leafData = b.pRef->intKeyLeaf; 007665 for(i=0; i<nOld; i++){ 007666 MemPage *pOld = apOld[i]; 007667 int limit = pOld->nCell; 007668 u8 *aData = pOld->aData; 007669 u16 maskPage = pOld->maskPage; 007670 u8 *piCell = aData + pOld->cellOffset; 007671 u8 *piEnd; 007672 VVA_ONLY( int nCellAtStart = b.nCell; ) 007673 007674 /* Verify that all sibling pages are of the same "type" (table-leaf, 007675 ** table-interior, index-leaf, or index-interior). 007676 */ 007677 if( pOld->aData[0]!=apOld[0]->aData[0] ){ 007678 rc = SQLITE_CORRUPT_BKPT; 007679 goto balance_cleanup; 007680 } 007681 007682 /* Load b.apCell[] with pointers to all cells in pOld. If pOld 007683 ** contains overflow cells, include them in the b.apCell[] array 007684 ** in the correct spot. 007685 ** 007686 ** Note that when there are multiple overflow cells, it is always the 007687 ** case that they are sequential and adjacent. This invariant arises 007688 ** because multiple overflows can only occurs when inserting divider 007689 ** cells into a parent on a prior balance, and divider cells are always 007690 ** adjacent and are inserted in order. There is an assert() tagged 007691 ** with "NOTE 1" in the overflow cell insertion loop to prove this 007692 ** invariant. 007693 ** 007694 ** This must be done in advance. Once the balance starts, the cell 007695 ** offset section of the btree page will be overwritten and we will no 007696 ** long be able to find the cells if a pointer to each cell is not saved 007697 ** first. 
007698 */ 007699 memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow)); 007700 if( pOld->nOverflow>0 ){ 007701 if( NEVER(limit<pOld->aiOvfl[0]) ){ 007702 rc = SQLITE_CORRUPT_BKPT; 007703 goto balance_cleanup; 007704 } 007705 limit = pOld->aiOvfl[0]; 007706 for(j=0; j<limit; j++){ 007707 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); 007708 piCell += 2; 007709 b.nCell++; 007710 } 007711 for(k=0; k<pOld->nOverflow; k++){ 007712 assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */ 007713 b.apCell[b.nCell] = pOld->apOvfl[k]; 007714 b.nCell++; 007715 } 007716 } 007717 piEnd = aData + pOld->cellOffset + 2*pOld->nCell; 007718 while( piCell<piEnd ){ 007719 assert( b.nCell<nMaxCells ); 007720 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); 007721 piCell += 2; 007722 b.nCell++; 007723 } 007724 assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) ); 007725 007726 cntOld[i] = b.nCell; 007727 if( i<nOld-1 && !leafData){ 007728 u16 sz = (u16)szNew[i]; 007729 u8 *pTemp; 007730 assert( b.nCell<nMaxCells ); 007731 b.szCell[b.nCell] = sz; 007732 pTemp = &aSpace1[iSpace1]; 007733 iSpace1 += sz; 007734 assert( sz<=pBt->maxLocal+23 ); 007735 assert( iSpace1 <= (int)pBt->pageSize ); 007736 memcpy(pTemp, apDiv[i], sz); 007737 b.apCell[b.nCell] = pTemp+leafCorrection; 007738 assert( leafCorrection==0 || leafCorrection==4 ); 007739 b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection; 007740 if( !pOld->leaf ){ 007741 assert( leafCorrection==0 ); 007742 assert( pOld->hdrOffset==0 ); 007743 /* The right pointer of the child page pOld becomes the left 007744 ** pointer of the divider cell */ 007745 memcpy(b.apCell[b.nCell], &pOld->aData[8], 4); 007746 }else{ 007747 assert( leafCorrection==4 ); 007748 while( b.szCell[b.nCell]<4 ){ 007749 /* Do not allow any cells smaller than 4 bytes. If a smaller cell 007750 ** does exist, pad it with 0x00 bytes. 
*/ 007751 assert( b.szCell[b.nCell]==3 || CORRUPT_DB ); 007752 assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB ); 007753 aSpace1[iSpace1++] = 0x00; 007754 b.szCell[b.nCell]++; 007755 } 007756 } 007757 b.nCell++; 007758 } 007759 } 007760 007761 /* 007762 ** Figure out the number of pages needed to hold all b.nCell cells. 007763 ** Store this number in "k". Also compute szNew[] which is the total 007764 ** size of all cells on the i-th page and cntNew[] which is the index 007765 ** in b.apCell[] of the cell that divides page i from page i+1. 007766 ** cntNew[k] should equal b.nCell. 007767 ** 007768 ** Values computed by this block: 007769 ** 007770 ** k: The total number of sibling pages 007771 ** szNew[i]: Spaced used on the i-th sibling page. 007772 ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to 007773 ** the right of the i-th sibling page. 007774 ** usableSpace: Number of bytes of space available on each sibling. 007775 ** 007776 */ 007777 usableSpace = pBt->usableSize - 12 + leafCorrection; 007778 for(i=k=0; i<nOld; i++, k++){ 007779 MemPage *p = apOld[i]; 007780 b.apEnd[k] = p->aDataEnd; 007781 b.ixNx[k] = cntOld[i]; 007782 if( k && b.ixNx[k]==b.ixNx[k-1] ){ 007783 k--; /* Omit b.ixNx[] entry for child pages with no cells */ 007784 } 007785 if( !leafData ){ 007786 k++; 007787 b.apEnd[k] = pParent->aDataEnd; 007788 b.ixNx[k] = cntOld[i]+1; 007789 } 007790 assert( p->nFree>=0 ); 007791 szNew[i] = usableSpace - p->nFree; 007792 for(j=0; j<p->nOverflow; j++){ 007793 szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]); 007794 } 007795 cntNew[i] = cntOld[i]; 007796 } 007797 k = nOld; 007798 for(i=0; i<k; i++){ 007799 int sz; 007800 while( szNew[i]>usableSpace ){ 007801 if( i+1>=k ){ 007802 k = i+2; 007803 if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } 007804 szNew[k-1] = 0; 007805 cntNew[k-1] = b.nCell; 007806 } 007807 sz = 2 + cachedCellSize(&b, cntNew[i]-1); 007808 szNew[i] -= sz; 007809 if( !leafData ){ 007810 if( 
cntNew[i]<b.nCell ){ 007811 sz = 2 + cachedCellSize(&b, cntNew[i]); 007812 }else{ 007813 sz = 0; 007814 } 007815 } 007816 szNew[i+1] += sz; 007817 cntNew[i]--; 007818 } 007819 while( cntNew[i]<b.nCell ){ 007820 sz = 2 + cachedCellSize(&b, cntNew[i]); 007821 if( szNew[i]+sz>usableSpace ) break; 007822 szNew[i] += sz; 007823 cntNew[i]++; 007824 if( !leafData ){ 007825 if( cntNew[i]<b.nCell ){ 007826 sz = 2 + cachedCellSize(&b, cntNew[i]); 007827 }else{ 007828 sz = 0; 007829 } 007830 } 007831 szNew[i+1] -= sz; 007832 } 007833 if( cntNew[i]>=b.nCell ){ 007834 k = i+1; 007835 }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){ 007836 rc = SQLITE_CORRUPT_BKPT; 007837 goto balance_cleanup; 007838 } 007839 } 007840 007841 /* 007842 ** The packing computed by the previous block is biased toward the siblings 007843 ** on the left side (siblings with smaller keys). The left siblings are 007844 ** always nearly full, while the right-most sibling might be nearly empty. 007845 ** The next block of code attempts to adjust the packing of siblings to 007846 ** get a better balance. 007847 ** 007848 ** This adjustment is more than an optimization. The packing above might 007849 ** be so out of balance as to be illegal. For example, the right-most 007850 ** sibling might be completely empty. This adjustment is not optional. 
007851 */ 007852 for(i=k-1; i>0; i--){ 007853 int szRight = szNew[i]; /* Size of sibling on the right */ 007854 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 007855 int r; /* Index of right-most cell in left sibling */ 007856 int d; /* Index of first cell to the left of right sibling */ 007857 007858 r = cntNew[i-1] - 1; 007859 d = r + 1 - leafData; 007860 (void)cachedCellSize(&b, d); 007861 do{ 007862 assert( d<nMaxCells ); 007863 assert( r<nMaxCells ); 007864 (void)cachedCellSize(&b, r); 007865 if( szRight!=0 007866 && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){ 007867 break; 007868 } 007869 szRight += b.szCell[d] + 2; 007870 szLeft -= b.szCell[r] + 2; 007871 cntNew[i-1] = r; 007872 r--; 007873 d--; 007874 }while( r>=0 ); 007875 szNew[i] = szRight; 007876 szNew[i-1] = szLeft; 007877 if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){ 007878 rc = SQLITE_CORRUPT_BKPT; 007879 goto balance_cleanup; 007880 } 007881 } 007882 007883 /* Sanity check: For a non-corrupt database file one of the follwing 007884 ** must be true: 007885 ** (1) We found one or more cells (cntNew[0])>0), or 007886 ** (2) pPage is a virtual root page. A virtual root page is when 007887 ** the real root page is page 1 and we are the only child of 007888 ** that page. 007889 */ 007890 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB); 007891 TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n", 007892 apOld[0]->pgno, apOld[0]->nCell, 007893 nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0, 007894 nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0 007895 )); 007896 007897 /* 007898 ** Allocate k new pages. Reuse old pages where possible. 
007899 */ 007900 pageFlags = apOld[0]->aData[0]; 007901 for(i=0; i<k; i++){ 007902 MemPage *pNew; 007903 if( i<nOld ){ 007904 pNew = apNew[i] = apOld[i]; 007905 apOld[i] = 0; 007906 rc = sqlite3PagerWrite(pNew->pDbPage); 007907 nNew++; 007908 if( rc ) goto balance_cleanup; 007909 }else{ 007910 assert( i>0 ); 007911 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); 007912 if( rc ) goto balance_cleanup; 007913 zeroPage(pNew, pageFlags); 007914 apNew[i] = pNew; 007915 nNew++; 007916 cntOld[i] = b.nCell; 007917 007918 /* Set the pointer-map entry for the new sibling page. */ 007919 if( ISAUTOVACUUM ){ 007920 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); 007921 if( rc!=SQLITE_OK ){ 007922 goto balance_cleanup; 007923 } 007924 } 007925 } 007926 } 007927 007928 /* 007929 ** Reassign page numbers so that the new pages are in ascending order. 007930 ** This helps to keep entries in the disk file in order so that a scan 007931 ** of the table is closer to a linear scan through the file. That in turn 007932 ** helps the operating system to deliver pages from the disk more rapidly. 007933 ** 007934 ** An O(n^2) insertion sort algorithm is used, but since n is never more 007935 ** than (NB+2) (a small constant), that should not be a problem. 007936 ** 007937 ** When NB==3, this one optimization makes the database about 25% faster 007938 ** for large insertions and deletions. 007939 */ 007940 for(i=0; i<nNew; i++){ 007941 aPgOrder[i] = aPgno[i] = apNew[i]->pgno; 007942 aPgFlags[i] = apNew[i]->pDbPage->flags; 007943 for(j=0; j<i; j++){ 007944 if( aPgno[j]==aPgno[i] ){ 007945 /* This branch is taken if the set of sibling pages somehow contains 007946 ** duplicate entries. This can happen if the database is corrupt. 
007947 ** It would be simpler to detect this as part of the loop below, but 007948 ** we do the detection here in order to avoid populating the pager 007949 ** cache with two separate objects associated with the same 007950 ** page number. */ 007951 assert( CORRUPT_DB ); 007952 rc = SQLITE_CORRUPT_BKPT; 007953 goto balance_cleanup; 007954 } 007955 } 007956 } 007957 for(i=0; i<nNew; i++){ 007958 int iBest = 0; /* aPgno[] index of page number to use */ 007959 for(j=1; j<nNew; j++){ 007960 if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j; 007961 } 007962 pgno = aPgOrder[iBest]; 007963 aPgOrder[iBest] = 0xffffffff; 007964 if( iBest!=i ){ 007965 if( iBest>i ){ 007966 sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0); 007967 } 007968 sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]); 007969 apNew[i]->pgno = pgno; 007970 } 007971 } 007972 007973 TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) " 007974 "%d(%d nc=%d) %d(%d nc=%d)\n", 007975 apNew[0]->pgno, szNew[0], cntNew[0], 007976 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, 007977 nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0, 007978 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, 007979 nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0, 007980 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, 007981 nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0, 007982 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0, 007983 nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0 007984 )); 007985 007986 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 007987 assert( nNew>=1 && nNew<=ArraySize(apNew) ); 007988 assert( apNew[nNew-1]!=0 ); 007989 put4byte(pRight, apNew[nNew-1]->pgno); 007990 007991 /* If the sibling pages are not leaves, ensure that the right-child pointer 007992 ** of the right-most new sibling page is set to the value that was 007993 ** originally in the same field of the right-most old sibling page. 
*/ 007994 if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){ 007995 MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1]; 007996 memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4); 007997 } 007998 007999 /* Make any required updates to pointer map entries associated with 008000 ** cells stored on sibling pages following the balance operation. Pointer 008001 ** map entries associated with divider cells are set by the insertCell() 008002 ** routine. The associated pointer map entries are: 008003 ** 008004 ** a) if the cell contains a reference to an overflow chain, the 008005 ** entry associated with the first page in the overflow chain, and 008006 ** 008007 ** b) if the sibling pages are not leaves, the child page associated 008008 ** with the cell. 008009 ** 008010 ** If the sibling pages are not leaves, then the pointer map entry 008011 ** associated with the right-child of each sibling may also need to be 008012 ** updated. This happens below, after the sibling pages have been 008013 ** populated, not here. 008014 */ 008015 if( ISAUTOVACUUM ){ 008016 MemPage *pOld; 008017 MemPage *pNew = pOld = apNew[0]; 008018 int cntOldNext = pNew->nCell + pNew->nOverflow; 008019 int iNew = 0; 008020 int iOld = 0; 008021 008022 for(i=0; i<b.nCell; i++){ 008023 u8 *pCell = b.apCell[i]; 008024 while( i==cntOldNext ){ 008025 iOld++; 008026 assert( iOld<nNew || iOld<nOld ); 008027 assert( iOld>=0 && iOld<NB ); 008028 pOld = iOld<nNew ? apNew[iOld] : apOld[iOld]; 008029 cntOldNext += pOld->nCell + pOld->nOverflow + !leafData; 008030 } 008031 if( i==cntNew[iNew] ){ 008032 pNew = apNew[++iNew]; 008033 if( !leafData ) continue; 008034 } 008035 008036 /* Cell pCell is destined for new sibling page pNew. Originally, it 008037 ** was either part of sibling page iOld (possibly an overflow cell), 008038 ** or else the divider cell to the left of sibling page iOld. 
So, 008039 ** if sibling page iOld had the same page number as pNew, and if 008040 ** pCell really was a part of sibling page iOld (not a divider or 008041 ** overflow cell), we can skip updating the pointer map entries. */ 008042 if( iOld>=nNew 008043 || pNew->pgno!=aPgno[iOld] 008044 || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd) 008045 ){ 008046 if( !leafCorrection ){ 008047 ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); 008048 } 008049 if( cachedCellSize(&b,i)>pNew->minLocal ){ 008050 ptrmapPutOvflPtr(pNew, pOld, pCell, &rc); 008051 } 008052 if( rc ) goto balance_cleanup; 008053 } 008054 } 008055 } 008056 008057 /* Insert new divider cells into pParent. */ 008058 for(i=0; i<nNew-1; i++){ 008059 u8 *pCell; 008060 u8 *pTemp; 008061 int sz; 008062 MemPage *pNew = apNew[i]; 008063 j = cntNew[i]; 008064 008065 assert( j<nMaxCells ); 008066 assert( b.apCell[j]!=0 ); 008067 pCell = b.apCell[j]; 008068 sz = b.szCell[j] + leafCorrection; 008069 pTemp = &aOvflSpace[iOvflSpace]; 008070 if( !pNew->leaf ){ 008071 memcpy(&pNew->aData[8], pCell, 4); 008072 }else if( leafData ){ 008073 /* If the tree is a leaf-data tree, and the siblings are leaves, 008074 ** then there is no divider cell in b.apCell[]. Instead, the divider 008075 ** cell consists of the integer key for the right-most cell of 008076 ** the sibling-page assembled above only. 008077 */ 008078 CellInfo info; 008079 j--; 008080 pNew->xParseCell(pNew, b.apCell[j], &info); 008081 pCell = pTemp; 008082 sz = 4 + putVarint(&pCell[4], info.nKey); 008083 pTemp = 0; 008084 }else{ 008085 pCell -= 4; 008086 /* Obscure case for non-leaf-data trees: If the cell at pCell was 008087 ** previously stored on a leaf node, and its reported size was 4 008088 ** bytes, then it may actually be smaller than this 008089 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of 008090 ** any cell). But it is important to pass the correct size to 008091 ** insertCell(), so reparse the cell now. 
008092 ** 008093 ** This can only happen for b-trees used to evaluate "IN (SELECT ...)" 008094 ** and WITHOUT ROWID tables with exactly one column which is the 008095 ** primary key. 008096 */ 008097 if( b.szCell[j]==4 ){ 008098 assert(leafCorrection==4); 008099 sz = pParent->xCellSize(pParent, pCell); 008100 } 008101 } 008102 iOvflSpace += sz; 008103 assert( sz<=pBt->maxLocal+23 ); 008104 assert( iOvflSpace <= (int)pBt->pageSize ); 008105 insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc); 008106 if( rc!=SQLITE_OK ) goto balance_cleanup; 008107 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008108 } 008109 008110 /* Now update the actual sibling pages. The order in which they are updated 008111 ** is important, as this code needs to avoid disrupting any page from which 008112 ** cells may still to be read. In practice, this means: 008113 ** 008114 ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1]) 008115 ** then it is not safe to update page apNew[iPg] until after 008116 ** the left-hand sibling apNew[iPg-1] has been updated. 008117 ** 008118 ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1]) 008119 ** then it is not safe to update page apNew[iPg] until after 008120 ** the right-hand sibling apNew[iPg+1] has been updated. 008121 ** 008122 ** If neither of the above apply, the page is safe to update. 008123 ** 008124 ** The iPg value in the following loop starts at nNew-1 goes down 008125 ** to 0, then back up to nNew-1 again, thus making two passes over 008126 ** the pages. On the initial downward pass, only condition (1) above 008127 ** needs to be tested because (2) will always be true from the previous 008128 ** step. On the upward pass, both conditions are always true, so the 008129 ** upwards pass simply processes pages that were missed on the downward 008130 ** pass. 008131 */ 008132 for(i=1-nNew; i<nNew; i++){ 008133 int iPg = i<0 ? 
-i : i; 008134 assert( iPg>=0 && iPg<nNew ); 008135 if( abDone[iPg] ) continue; /* Skip pages already processed */ 008136 if( i>=0 /* On the upwards pass, or... */ 008137 || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */ 008138 ){ 008139 int iNew; 008140 int iOld; 008141 int nNewCell; 008142 008143 /* Verify condition (1): If cells are moving left, update iPg 008144 ** only after iPg-1 has already been updated. */ 008145 assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] ); 008146 008147 /* Verify condition (2): If cells are moving right, update iPg 008148 ** only after iPg+1 has already been updated. */ 008149 assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] ); 008150 008151 if( iPg==0 ){ 008152 iNew = iOld = 0; 008153 nNewCell = cntNew[0]; 008154 }else{ 008155 iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell; 008156 iNew = cntNew[iPg-1] + !leafData; 008157 nNewCell = cntNew[iPg] - iNew; 008158 } 008159 008160 rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b); 008161 if( rc ) goto balance_cleanup; 008162 abDone[iPg]++; 008163 apNew[iPg]->nFree = usableSpace-szNew[iPg]; 008164 assert( apNew[iPg]->nOverflow==0 ); 008165 assert( apNew[iPg]->nCell==nNewCell ); 008166 } 008167 } 008168 008169 /* All pages have been processed exactly once */ 008170 assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 ); 008171 008172 assert( nOld>0 ); 008173 assert( nNew>0 ); 008174 008175 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ 008176 /* The root page of the b-tree now contains no cells. The only sibling 008177 ** page is the right-child of the parent. Copy the contents of the 008178 ** child page into the parent, decreasing the overall height of the 008179 ** b-tree structure by one. This is described as the "balance-shallower" 008180 ** sub-algorithm in some documentation. 
008181 ** 008182 ** If this is an auto-vacuum database, the call to copyNodeContent() 008183 ** sets all pointer-map entries corresponding to database image pages 008184 ** for which the pointer is stored within the content being copied. 008185 ** 008186 ** It is critical that the child page be defragmented before being 008187 ** copied into the parent, because if the parent is page 1 then it will 008188 ** by smaller than the child due to the database header, and so all the 008189 ** free space needs to be up front. 008190 */ 008191 assert( nNew==1 || CORRUPT_DB ); 008192 rc = defragmentPage(apNew[0], -1); 008193 testcase( rc!=SQLITE_OK ); 008194 assert( apNew[0]->nFree == 008195 (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset 008196 - apNew[0]->nCell*2) 008197 || rc!=SQLITE_OK 008198 ); 008199 copyNodeContent(apNew[0], pParent, &rc); 008200 freePage(apNew[0], &rc); 008201 }else if( ISAUTOVACUUM && !leafCorrection ){ 008202 /* Fix the pointer map entries associated with the right-child of each 008203 ** sibling page. All other pointer map entries have already been taken 008204 ** care of. */ 008205 for(i=0; i<nNew; i++){ 008206 u32 key = get4byte(&apNew[i]->aData[8]); 008207 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); 008208 } 008209 } 008210 008211 assert( pParent->isInit ); 008212 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", 008213 nOld, nNew, b.nCell)); 008214 008215 /* Free any old pages that were not reused as new pages. 008216 */ 008217 for(i=nNew; i<nOld; i++){ 008218 freePage(apOld[i], &rc); 008219 } 008220 008221 #if 0 008222 if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){ 008223 /* The ptrmapCheckPages() contains assert() statements that verify that 008224 ** all pointer map pages are set correctly. This is helpful while 008225 ** debugging. This is usually disabled because a corrupt database may 008226 ** cause an assert() statement to fail. 
*/ 008227 ptrmapCheckPages(apNew, nNew); 008228 ptrmapCheckPages(&pParent, 1); 008229 } 008230 #endif 008231 008232 /* 008233 ** Cleanup before returning. 008234 */ 008235 balance_cleanup: 008236 sqlite3StackFree(0, b.apCell); 008237 for(i=0; i<nOld; i++){ 008238 releasePage(apOld[i]); 008239 } 008240 for(i=0; i<nNew; i++){ 008241 releasePage(apNew[i]); 008242 } 008243 008244 return rc; 008245 } 008246 008247 008248 /* 008249 ** This function is called when the root page of a b-tree structure is 008250 ** overfull (has one or more overflow pages). 008251 ** 008252 ** A new child page is allocated and the contents of the current root 008253 ** page, including overflow cells, are copied into the child. The root 008254 ** page is then overwritten to make it an empty page with the right-child 008255 ** pointer pointing to the new page. 008256 ** 008257 ** Before returning, all pointer-map entries corresponding to pages 008258 ** that the new child-page now contains pointers to are updated. The 008259 ** entry corresponding to the new right-child pointer of the root 008260 ** page is also updated. 008261 ** 008262 ** If successful, *ppChild is set to contain a reference to the child 008263 ** page and SQLITE_OK is returned. In this case the caller is required 008264 ** to call releasePage() on *ppChild exactly once. If an error occurs, 008265 ** an error code is returned and *ppChild is set to 0. 008266 */ 008267 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ 008268 int rc; /* Return value from subprocedures */ 008269 MemPage *pChild = 0; /* Pointer to a new child page */ 008270 Pgno pgnoChild = 0; /* Page number of the new child page */ 008271 BtShared *pBt = pRoot->pBt; /* The BTree */ 008272 008273 assert( pRoot->nOverflow>0 ); 008274 assert( sqlite3_mutex_held(pBt->mutex) ); 008275 008276 /* Make pRoot, the root page of the b-tree, writable. Allocate a new 008277 ** page that will become the new right-child of pPage. 
Copy the contents 008278 ** of the node stored on pRoot into the new child page. 008279 */ 008280 rc = sqlite3PagerWrite(pRoot->pDbPage); 008281 if( rc==SQLITE_OK ){ 008282 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); 008283 copyNodeContent(pRoot, pChild, &rc); 008284 if( ISAUTOVACUUM ){ 008285 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); 008286 } 008287 } 008288 if( rc ){ 008289 *ppChild = 0; 008290 releasePage(pChild); 008291 return rc; 008292 } 008293 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 008294 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 008295 assert( pChild->nCell==pRoot->nCell || CORRUPT_DB ); 008296 008297 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno)); 008298 008299 /* Copy the overflow cells from pRoot to pChild */ 008300 memcpy(pChild->aiOvfl, pRoot->aiOvfl, 008301 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0])); 008302 memcpy(pChild->apOvfl, pRoot->apOvfl, 008303 pRoot->nOverflow*sizeof(pRoot->apOvfl[0])); 008304 pChild->nOverflow = pRoot->nOverflow; 008305 008306 /* Zero the contents of pRoot. Then install pChild as the right-child. */ 008307 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); 008308 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); 008309 008310 *ppChild = pChild; 008311 return SQLITE_OK; 008312 } 008313 008314 /* 008315 ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid 008316 ** on the same B-tree as pCur. 008317 ** 008318 ** This can if a database is corrupt with two or more SQL tables 008319 ** pointing to the same b-tree. If an insert occurs on one SQL table 008320 ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL 008321 ** table linked to the same b-tree. If the secondary insert causes a 008322 ** rebalance, that can change content out from under the cursor on the 008323 ** first SQL table, violating invariants on the first insert. 
008324 */ 008325 static int anotherValidCursor(BtCursor *pCur){ 008326 BtCursor *pOther; 008327 for(pOther=pCur->pBt->pCursor; pOther; pOther=pOther->pNext){ 008328 if( pOther!=pCur 008329 && pOther->eState==CURSOR_VALID 008330 && pOther->pPage==pCur->pPage 008331 ){ 008332 return SQLITE_CORRUPT_BKPT; 008333 } 008334 } 008335 return SQLITE_OK; 008336 } 008337 008338 /* 008339 ** The page that pCur currently points to has just been modified in 008340 ** some way. This function figures out if this modification means the 008341 ** tree needs to be balanced, and if so calls the appropriate balancing 008342 ** routine. Balancing routines are: 008343 ** 008344 ** balance_quick() 008345 ** balance_deeper() 008346 ** balance_nonroot() 008347 */ 008348 static int balance(BtCursor *pCur){ 008349 int rc = SQLITE_OK; 008350 const int nMin = pCur->pBt->usableSize * 2 / 3; 008351 u8 aBalanceQuickSpace[13]; 008352 u8 *pFree = 0; 008353 008354 VVA_ONLY( int balance_quick_called = 0 ); 008355 VVA_ONLY( int balance_deeper_called = 0 ); 008356 008357 do { 008358 int iPage; 008359 MemPage *pPage = pCur->pPage; 008360 008361 if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break; 008362 if( pPage->nOverflow==0 && pPage->nFree<=nMin ){ 008363 break; 008364 }else if( (iPage = pCur->iPage)==0 ){ 008365 if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){ 008366 /* The root page of the b-tree is overfull. In this case call the 008367 ** balance_deeper() function to create a new child for the root-page 008368 ** and copy the current contents of the root-page to it. The 008369 ** next iteration of the do-loop will balance the child page. 
008370 */ 008371 assert( balance_deeper_called==0 ); 008372 VVA_ONLY( balance_deeper_called++ ); 008373 rc = balance_deeper(pPage, &pCur->apPage[1]); 008374 if( rc==SQLITE_OK ){ 008375 pCur->iPage = 1; 008376 pCur->ix = 0; 008377 pCur->aiIdx[0] = 0; 008378 pCur->apPage[0] = pPage; 008379 pCur->pPage = pCur->apPage[1]; 008380 assert( pCur->pPage->nOverflow ); 008381 } 008382 }else{ 008383 break; 008384 } 008385 }else{ 008386 MemPage * const pParent = pCur->apPage[iPage-1]; 008387 int const iIdx = pCur->aiIdx[iPage-1]; 008388 008389 rc = sqlite3PagerWrite(pParent->pDbPage); 008390 if( rc==SQLITE_OK && pParent->nFree<0 ){ 008391 rc = btreeComputeFreeSpace(pParent); 008392 } 008393 if( rc==SQLITE_OK ){ 008394 #ifndef SQLITE_OMIT_QUICKBALANCE 008395 if( pPage->intKeyLeaf 008396 && pPage->nOverflow==1 008397 && pPage->aiOvfl[0]==pPage->nCell 008398 && pParent->pgno!=1 008399 && pParent->nCell==iIdx 008400 ){ 008401 /* Call balance_quick() to create a new sibling of pPage on which 008402 ** to store the overflow cell. balance_quick() inserts a new cell 008403 ** into pParent, which may cause pParent overflow. If this 008404 ** happens, the next iteration of the do-loop will balance pParent 008405 ** use either balance_nonroot() or balance_deeper(). Until this 008406 ** happens, the overflow cell is stored in the aBalanceQuickSpace[] 008407 ** buffer. 008408 ** 008409 ** The purpose of the following assert() is to check that only a 008410 ** single call to balance_quick() is made for each call to this 008411 ** function. If this were not verified, a subtle bug involving reuse 008412 ** of the aBalanceQuickSpace[] might sneak in. 008413 */ 008414 assert( balance_quick_called==0 ); 008415 VVA_ONLY( balance_quick_called++ ); 008416 rc = balance_quick(pParent, pPage, aBalanceQuickSpace); 008417 }else 008418 #endif 008419 { 008420 /* In this case, call balance_nonroot() to redistribute cells 008421 ** between pPage and up to 2 of its sibling pages. 
This involves 008422 ** modifying the contents of pParent, which may cause pParent to 008423 ** become overfull or underfull. The next iteration of the do-loop 008424 ** will balance the parent page to correct this. 008425 ** 008426 ** If the parent page becomes overfull, the overflow cell or cells 008427 ** are stored in the pSpace buffer allocated immediately below. 008428 ** A subsequent iteration of the do-loop will deal with this by 008429 ** calling balance_nonroot() (balance_deeper() may be called first, 008430 ** but it doesn't deal with overflow cells - just moves them to a 008431 ** different page). Once this subsequent call to balance_nonroot() 008432 ** has completed, it is safe to release the pSpace buffer used by 008433 ** the previous call, as the overflow cell data will have been 008434 ** copied either into the body of a database page or into the new 008435 ** pSpace buffer passed to the latter call to balance_nonroot(). 008436 */ 008437 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); 008438 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, 008439 pCur->hints&BTREE_BULKLOAD); 008440 if( pFree ){ 008441 /* If pFree is not NULL, it points to the pSpace buffer used 008442 ** by a previous call to balance_nonroot(). Its contents are 008443 ** now stored either on real database pages or within the 008444 ** new pSpace buffer, so it may be safely freed here. */ 008445 sqlite3PageFree(pFree); 008446 } 008447 008448 /* The pSpace buffer will be freed after the next call to 008449 ** balance_nonroot(), or just before this function returns, whichever 008450 ** comes first. */ 008451 pFree = pSpace; 008452 } 008453 } 008454 008455 pPage->nOverflow = 0; 008456 008457 /* The next iteration of the do-loop balances the parent page. 
*/
      releasePage(pPage);
      pCur->iPage--;
      assert( pCur->iPage>=0 );
      pCur->pPage = pCur->apPage[pCur->iPage];
    }
  }while( rc==SQLITE_OK );

  if( pFree ){
    sqlite3PageFree(pFree);
  }
  return rc;
}

/* Overwrite content from pX into pDest.  Only do the write if the
** content is different from what is already there.
**
** The logical source is pX->nData bytes taken from pX->pData followed by
** zeros.  This routine makes the iAmt bytes starting at pDest equal to
** bytes iOffset through iOffset+iAmt-1 of that logical source.
**
** sqlite3PagerWrite() is only invoked on pPage when at least one byte
** actually has to change.  The return value is SQLITE_OK, or whatever
** non-zero code sqlite3PagerWrite() returned.
*/
static int btreeOverwriteContent(
  MemPage *pPage,           /* MemPage on which writing will occur */
  u8 *pDest,                /* Pointer to the place to start writing */
  const BtreePayload *pX,   /* Source of data to write */
  int iOffset,              /* Offset of first byte to write */
  int iAmt                  /* Number of bytes to be written */
){
  /* Number of real (non-zero-fill) source bytes at or after iOffset */
  int nData = pX->nData - iOffset;
  if( nData<=0 ){
    /* Overwriting with zeros.  Skip over the prefix of pDest that is
    ** already zero so that an all-zero destination is left untouched. */
    int i;
    for(i=0; i<iAmt && pDest[i]==0; i++){}
    if( i<iAmt ){
      int rc = sqlite3PagerWrite(pPage->pDbPage);
      if( rc ) return rc;
      memset(pDest + i, 0, iAmt - i);
    }
  }else{
    if( nData<iAmt ){
      /* Mixed read data and zeros at the end.  Make a recursive call
      ** to write the zeros then fall through to write the real data */
      int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData,
                                 iAmt-nData);
      if( rc ) return rc;
      iAmt = nData;
    }
    /* Only dirty the page if the bytes really differ */
    if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){
      int rc = sqlite3PagerWrite(pPage->pDbPage);
      if( rc ) return rc;
      /* In a corrupt database, it is possible for the source and destination
      ** buffers to overlap.  This is harmless since the database is already
      ** corrupt but it does cause valgrind and ASAN warnings.  So use
      ** memmove(). */
      memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt);
    }
  }
  return SQLITE_OK;
}

/*
** Overwrite the cell that cursor pCur is pointing to with fresh content
** contained in pX.
**
** The first pCur->info.nLocal bytes are written to the payload area on
** the cursor's page.  If the total size (pX->nData + pX->nZero) is larger
** than that, the remainder is written to the chain of overflow pages,
** starting with the page number stored in the 4 bytes that follow the
** local payload.
**
** Returns SQLITE_OK on success.  SQLITE_CORRUPT is returned if the local
** payload does not lie within the page, or if an overflow page has a
** pager reference count other than 1.  Errors from btreeGetPage() and
** btreeOverwriteContent() are returned unchanged.
*/
static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){
  int iOffset;                        /* Next byte of pX->pData to write */
  int nTotal = pX->nData + pX->nZero; /* Total bytes to write */
  int rc;                             /* Return code */
  MemPage *pPage = pCur->pPage;       /* Page being written */
  BtShared *pBt;                      /* Btree */
  Pgno ovflPgno;                      /* Next overflow page to write */
  u32 ovflPageSize;                   /* Size to write on overflow page */

  /* The local payload must lie between the cell pointer array and the
  ** end of the page; anything else means the page is corrupt. */
  if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd
   || pCur->info.pPayload < pPage->aData + pPage->cellOffset
  ){
    return SQLITE_CORRUPT_BKPT;
  }
  /* Overwrite the local portion first */
  rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX,
                             0, pCur->info.nLocal);
  if( rc ) return rc;
  if( pCur->info.nLocal==nTotal ) return SQLITE_OK;

  /* Now overwrite the overflow pages */
  iOffset = pCur->info.nLocal;
  assert( nTotal>=0 );
  assert( iOffset>=0 );
  ovflPgno = get4byte(pCur->info.pPayload + iOffset);
  pBt = pPage->pBt;
  ovflPageSize = pBt->usableSize - 4;
  do{
    /* From here on pPage refers to the current overflow page, not to the
    ** page the cursor points at. */
    rc = btreeGetPage(pBt, ovflPgno, &pPage, 0);
    if( rc ) return rc;
    if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 ){
      rc = SQLITE_CORRUPT_BKPT;
    }else{
      if( iOffset+ovflPageSize<(u32)nTotal ){
        /* More pages follow: the first 4 bytes hold the next page number */
        ovflPgno = get4byte(pPage->aData);
      }else{
        /* Last page of the chain: only write what is left */
        ovflPageSize = nTotal - iOffset;
      }
      rc = btreeOverwriteContent(pPage, pPage->aData+4, pX,
                                 iOffset, ovflPageSize);
    }
    sqlite3PagerUnref(pPage->pDbPage);
    if( rc ) return rc;
    iOffset += ovflPageSize;
  }while( iOffset<nTotal );
  return SQLITE_OK;
}


/*
** Insert a new record into the BTree.  The content of the new record
** is described by the pX object.  The pCur cursor is used only to
** define what table the record should be inserted into, and is left
** pointing at a random location.
**
** For a table btree (used for rowid tables), only the pX.nKey value of
** the key is used.  The pX.pKey value must be NULL.  The pX.nKey is the
** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
** hold the content of the row.
**
** For an index btree (used for indexes and WITHOUT ROWID tables), the
** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The
** pX.pData,nData,nZero fields must be zero.
**
** If the seekResult parameter is non-zero, then a successful call to
** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
** been performed.  In other words, if seekResult!=0 then the cursor
** is currently pointing to a cell that will be adjacent to the cell
** to be inserted.  If seekResult<0 then pCur points to a cell that is
** smaller than (pKey,nKey).  If seekResult>0 then pCur points to a cell
** that is larger than (pKey,nKey).
**
** If seekResult==0, that means pCur is pointing at some unknown location.
** In that case, this routine must seek the cursor to the correct insertion
** point for (pKey,nKey) before doing the insertion.  For index btrees,
** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
** key values and pX->aMem can be used instead of pX->pKey to avoid having
** to decode the key.
008595 */ 008596 int sqlite3BtreeInsert( 008597 BtCursor *pCur, /* Insert data into the table of this cursor */ 008598 const BtreePayload *pX, /* Content of the row to be inserted */ 008599 int flags, /* True if this is likely an append */ 008600 int seekResult /* Result of prior MovetoUnpacked() call */ 008601 ){ 008602 int rc; 008603 int loc = seekResult; /* -1: before desired location +1: after */ 008604 int szNew = 0; 008605 int idx; 008606 MemPage *pPage; 008607 Btree *p = pCur->pBtree; 008608 BtShared *pBt = p->pBt; 008609 unsigned char *oldCell; 008610 unsigned char *newCell = 0; 008611 008612 assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags ); 008613 008614 if( pCur->eState==CURSOR_FAULT ){ 008615 assert( pCur->skipNext!=SQLITE_OK ); 008616 return pCur->skipNext; 008617 } 008618 008619 assert( cursorOwnsBtShared(pCur) ); 008620 assert( (pCur->curFlags & BTCF_WriteFlag)!=0 008621 && pBt->inTransaction==TRANS_WRITE 008622 && (pBt->btsFlags & BTS_READ_ONLY)==0 ); 008623 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 008624 008625 /* Assert that the caller has been consistent. If this cursor was opened 008626 ** expecting an index b-tree, then the caller should be inserting blob 008627 ** keys with no associated data. If the cursor was opened expecting an 008628 ** intkey table, the caller should be inserting integer keys with a 008629 ** blob of associated data. */ 008630 assert( (pX->pKey==0)==(pCur->pKeyInfo==0) ); 008631 008632 /* Save the positions of any other cursors open on this table. 008633 ** 008634 ** In some cases, the call to btreeMoveto() below is a no-op. For 008635 ** example, when inserting data into a table with auto-generated integer 008636 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 008637 ** integer key to use. It then calls this function to actually insert the 008638 ** data into the intkey B-Tree. 
In this case btreeMoveto() recognizes 008639 ** that the cursor is already where it needs to be and returns without 008640 ** doing any work. To avoid thwarting these optimizations, it is important 008641 ** not to clear the cursor here. 008642 */ 008643 if( pCur->curFlags & BTCF_Multiple ){ 008644 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 008645 if( rc ) return rc; 008646 } 008647 008648 if( pCur->pKeyInfo==0 ){ 008649 assert( pX->pKey==0 ); 008650 /* If this is an insert into a table b-tree, invalidate any incrblob 008651 ** cursors open on the row being replaced */ 008652 invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0); 008653 008654 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 008655 ** to a row with the same key as the new entry being inserted. 008656 */ 008657 #ifdef SQLITE_DEBUG 008658 if( flags & BTREE_SAVEPOSITION ){ 008659 assert( pCur->curFlags & BTCF_ValidNKey ); 008660 assert( pX->nKey==pCur->info.nKey ); 008661 assert( loc==0 ); 008662 } 008663 #endif 008664 008665 /* On the other hand, BTREE_SAVEPOSITION==0 does not imply 008666 ** that the cursor is not pointing to a row to be overwritten. 008667 ** So do a complete check. 008668 */ 008669 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){ 008670 /* The cursor is pointing to the entry that is to be 008671 ** overwritten */ 008672 assert( pX->nData>=0 && pX->nZero>=0 ); 008673 if( pCur->info.nSize!=0 008674 && pCur->info.nPayload==(u32)pX->nData+pX->nZero 008675 ){ 008676 /* New entry is the same size as the old. Do an overwrite */ 008677 return btreeOverwriteCell(pCur, pX); 008678 } 008679 assert( loc==0 ); 008680 }else if( loc==0 ){ 008681 /* The cursor is *not* pointing to the cell to be overwritten, nor 008682 ** to an adjacent cell. Move the cursor so that it is pointing either 008683 ** to the cell to be overwritten or an adjacent cell. 
008684 */ 008685 rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc); 008686 if( rc ) return rc; 008687 } 008688 }else{ 008689 /* This is an index or a WITHOUT ROWID table */ 008690 008691 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 008692 ** to a row with the same key as the new entry being inserted. 008693 */ 008694 assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 ); 008695 008696 /* If the cursor is not already pointing either to the cell to be 008697 ** overwritten, or if a new cell is being inserted, if the cursor is 008698 ** not pointing to an immediately adjacent cell, then move the cursor 008699 ** so that it does. 008700 */ 008701 if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){ 008702 if( pX->nMem ){ 008703 UnpackedRecord r; 008704 r.pKeyInfo = pCur->pKeyInfo; 008705 r.aMem = pX->aMem; 008706 r.nField = pX->nMem; 008707 r.default_rc = 0; 008708 r.errCode = 0; 008709 r.r1 = 0; 008710 r.r2 = 0; 008711 r.eqSeen = 0; 008712 rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc); 008713 }else{ 008714 rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc); 008715 } 008716 if( rc ) return rc; 008717 } 008718 008719 /* If the cursor is currently pointing to an entry to be overwritten 008720 ** and the new content is the same as as the old, then use the 008721 ** overwrite optimization. 
008722 */ 008723 if( loc==0 ){ 008724 getCellInfo(pCur); 008725 if( pCur->info.nKey==pX->nKey ){ 008726 BtreePayload x2; 008727 x2.pData = pX->pKey; 008728 x2.nData = pX->nKey; 008729 x2.nZero = 0; 008730 return btreeOverwriteCell(pCur, &x2); 008731 } 008732 } 008733 008734 } 008735 assert( pCur->eState==CURSOR_VALID 008736 || (pCur->eState==CURSOR_INVALID && loc) 008737 || CORRUPT_DB ); 008738 008739 pPage = pCur->pPage; 008740 assert( pPage->intKey || pX->nKey>=0 ); 008741 assert( pPage->leaf || !pPage->intKey ); 008742 if( pPage->nFree<0 ){ 008743 rc = btreeComputeFreeSpace(pPage); 008744 if( rc ) return rc; 008745 } 008746 008747 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", 008748 pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno, 008749 loc==0 ? "overwrite" : "new entry")); 008750 assert( pPage->isInit ); 008751 newCell = pBt->pTmpSpace; 008752 assert( newCell!=0 ); 008753 rc = fillInCell(pPage, newCell, pX, &szNew); 008754 if( rc ) goto end_insert; 008755 assert( szNew==pPage->xCellSize(pPage, newCell) ); 008756 assert( szNew <= MX_CELL_SIZE(pBt) ); 008757 idx = pCur->ix; 008758 if( loc==0 ){ 008759 CellInfo info; 008760 assert( idx<pPage->nCell ); 008761 rc = sqlite3PagerWrite(pPage->pDbPage); 008762 if( rc ){ 008763 goto end_insert; 008764 } 008765 oldCell = findCell(pPage, idx); 008766 if( !pPage->leaf ){ 008767 memcpy(newCell, oldCell, 4); 008768 } 008769 rc = clearCell(pPage, oldCell, &info); 008770 testcase( pCur->curFlags & BTCF_ValidOvfl ); 008771 invalidateOverflowCache(pCur); 008772 if( info.nSize==szNew && info.nLocal==info.nPayload 008773 && (!ISAUTOVACUUM || szNew<pPage->minLocal) 008774 ){ 008775 /* Overwrite the old cell with the new if they are the same size. 008776 ** We could also try to do this if the old cell is smaller, then add 008777 ** the leftover space to the free list. But experiments show that 008778 ** doing that is no faster then skipping this optimization and just 008779 ** calling dropCell() and insertCell(). 
008780 ** 008781 ** This optimization cannot be used on an autovacuum database if the 008782 ** new entry uses overflow pages, as the insertCell() call below is 008783 ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry. */ 008784 assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */ 008785 if( oldCell < pPage->aData+pPage->hdrOffset+10 ){ 008786 return SQLITE_CORRUPT_BKPT; 008787 } 008788 if( oldCell+szNew > pPage->aDataEnd ){ 008789 return SQLITE_CORRUPT_BKPT; 008790 } 008791 memcpy(oldCell, newCell, szNew); 008792 return SQLITE_OK; 008793 } 008794 dropCell(pPage, idx, info.nSize, &rc); 008795 if( rc ) goto end_insert; 008796 }else if( loc<0 && pPage->nCell>0 ){ 008797 assert( pPage->leaf ); 008798 idx = ++pCur->ix; 008799 pCur->curFlags &= ~BTCF_ValidNKey; 008800 }else{ 008801 assert( pPage->leaf ); 008802 } 008803 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc); 008804 assert( pPage->nOverflow==0 || rc==SQLITE_OK ); 008805 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); 008806 008807 /* If no error has occurred and pPage has an overflow cell, call balance() 008808 ** to redistribute the cells within the tree. Since balance() may move 008809 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey 008810 ** variables. 008811 ** 008812 ** Previous versions of SQLite called moveToRoot() to move the cursor 008813 ** back to the root page as balance() used to invalidate the contents 008814 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, 008815 ** set the cursor state to "invalid". This makes common insert operations 008816 ** slightly faster. 008817 ** 008818 ** There is a subtle but important optimization here too. When inserting 008819 ** multiple records into an intkey b-tree using a single cursor (as can 008820 ** happen while processing an "INSERT INTO ... SELECT" statement), it 008821 ** is advantageous to leave the cursor pointing to the last entry in 008822 ** the b-tree if possible. 
If the cursor is left pointing to the last 008823 ** entry in the table, and the next row inserted has an integer key 008824 ** larger than the largest existing key, it is possible to insert the 008825 ** row without seeking the cursor. This can be a big performance boost. 008826 */ 008827 pCur->info.nSize = 0; 008828 if( pPage->nOverflow ){ 008829 assert( rc==SQLITE_OK ); 008830 pCur->curFlags &= ~(BTCF_ValidNKey); 008831 rc = balance(pCur); 008832 008833 /* Must make sure nOverflow is reset to zero even if the balance() 008834 ** fails. Internal data structure corruption will result otherwise. 008835 ** Also, set the cursor state to invalid. This stops saveCursorPosition() 008836 ** from trying to save the current position of the cursor. */ 008837 pCur->pPage->nOverflow = 0; 008838 pCur->eState = CURSOR_INVALID; 008839 if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){ 008840 btreeReleaseAllCursorPages(pCur); 008841 if( pCur->pKeyInfo ){ 008842 assert( pCur->pKey==0 ); 008843 pCur->pKey = sqlite3Malloc( pX->nKey ); 008844 if( pCur->pKey==0 ){ 008845 rc = SQLITE_NOMEM; 008846 }else{ 008847 memcpy(pCur->pKey, pX->pKey, pX->nKey); 008848 } 008849 } 008850 pCur->eState = CURSOR_REQUIRESEEK; 008851 pCur->nKey = pX->nKey; 008852 } 008853 } 008854 assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 ); 008855 008856 end_insert: 008857 return rc; 008858 } 008859 008860 /* 008861 ** Delete the entry that the cursor is pointing to. 008862 ** 008863 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then 008864 ** the cursor is left pointing at an arbitrary location after the delete. 008865 ** But if that bit is set, then the cursor is left in a state such that 008866 ** the next call to BtreeNext() or BtreePrev() moves it to the same row 008867 ** as it would have been on if the call to BtreeDelete() had been omitted. 
008868 ** 008869 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes 008870 ** associated with a single table entry and its indexes. Only one of those 008871 ** deletes is considered the "primary" delete. The primary delete occurs 008872 ** on a cursor that is not a BTREE_FORDELETE cursor. All but one delete 008873 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag. 008874 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation, 008875 ** but which might be used by alternative storage engines. 008876 */ 008877 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ 008878 Btree *p = pCur->pBtree; 008879 BtShared *pBt = p->pBt; 008880 int rc; /* Return code */ 008881 MemPage *pPage; /* Page to delete cell from */ 008882 unsigned char *pCell; /* Pointer to cell to delete */ 008883 int iCellIdx; /* Index of cell to delete */ 008884 int iCellDepth; /* Depth of node containing pCell */ 008885 CellInfo info; /* Size of the cell being deleted */ 008886 int bSkipnext = 0; /* Leaf cursor in SKIPNEXT state */ 008887 u8 bPreserve = flags & BTREE_SAVEPOSITION; /* Keep cursor valid */ 008888 008889 assert( cursorOwnsBtShared(pCur) ); 008890 assert( pBt->inTransaction==TRANS_WRITE ); 008891 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 008892 assert( pCur->curFlags & BTCF_WriteFlag ); 008893 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 008894 assert( !hasReadConflicts(p, pCur->pgnoRoot) ); 008895 assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 ); 008896 if( pCur->eState==CURSOR_REQUIRESEEK ){ 008897 rc = btreeRestoreCursorPosition(pCur); 008898 if( rc ) return rc; 008899 } 008900 assert( pCur->eState==CURSOR_VALID ); 008901 008902 iCellDepth = pCur->iPage; 008903 iCellIdx = pCur->ix; 008904 pPage = pCur->pPage; 008905 pCell = findCell(pPage, iCellIdx); 008906 if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ) return SQLITE_CORRUPT; 008907 008908 /* If the bPreserve flag is 
set to true, then the cursor position must 008909 ** be preserved following this delete operation. If the current delete 008910 ** will cause a b-tree rebalance, then this is done by saving the cursor 008911 ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 008912 ** returning. 008913 ** 008914 ** Or, if the current delete will not cause a rebalance, then the cursor 008915 ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately 008916 ** before or after the deleted entry. In this case set bSkipnext to true. */ 008917 if( bPreserve ){ 008918 if( !pPage->leaf 008919 || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3) 008920 || pPage->nCell==1 /* See dbfuzz001.test for a test case */ 008921 ){ 008922 /* A b-tree rebalance will be required after deleting this entry. 008923 ** Save the cursor key. */ 008924 rc = saveCursorKey(pCur); 008925 if( rc ) return rc; 008926 }else{ 008927 bSkipnext = 1; 008928 } 008929 } 008930 008931 /* If the page containing the entry to delete is not a leaf page, move 008932 ** the cursor to the largest entry in the tree that is smaller than 008933 ** the entry being deleted. This cell will replace the cell being deleted 008934 ** from the internal node. The 'previous' entry is used for this instead 008935 ** of the 'next' entry, as the previous entry is always a part of the 008936 ** sub-tree headed by the child page of the cell being deleted. This makes 008937 ** balancing the tree following the delete operation easier. */ 008938 if( !pPage->leaf ){ 008939 rc = sqlite3BtreePrevious(pCur, 0); 008940 assert( rc!=SQLITE_DONE ); 008941 if( rc ) return rc; 008942 } 008943 008944 /* Save the positions of any other cursors open on this table before 008945 ** making any modifications. 
*/ 008946 if( pCur->curFlags & BTCF_Multiple ){ 008947 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 008948 if( rc ) return rc; 008949 } 008950 008951 /* If this is a delete operation to remove a row from a table b-tree, 008952 ** invalidate any incrblob cursors open on the row being deleted. */ 008953 if( pCur->pKeyInfo==0 ){ 008954 invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0); 008955 } 008956 008957 /* Make the page containing the entry to be deleted writable. Then free any 008958 ** overflow pages associated with the entry and finally remove the cell 008959 ** itself from within the page. */ 008960 rc = sqlite3PagerWrite(pPage->pDbPage); 008961 if( rc ) return rc; 008962 rc = clearCell(pPage, pCell, &info); 008963 dropCell(pPage, iCellIdx, info.nSize, &rc); 008964 if( rc ) return rc; 008965 008966 /* If the cell deleted was not located on a leaf page, then the cursor 008967 ** is currently pointing to the largest entry in the sub-tree headed 008968 ** by the child-page of the cell that was just deleted from an internal 008969 ** node. The cell from the leaf node needs to be moved to the internal 008970 ** node to replace the deleted cell. 
*/ 008971 if( !pPage->leaf ){ 008972 MemPage *pLeaf = pCur->pPage; 008973 int nCell; 008974 Pgno n; 008975 unsigned char *pTmp; 008976 008977 if( pLeaf->nFree<0 ){ 008978 rc = btreeComputeFreeSpace(pLeaf); 008979 if( rc ) return rc; 008980 } 008981 if( iCellDepth<pCur->iPage-1 ){ 008982 n = pCur->apPage[iCellDepth+1]->pgno; 008983 }else{ 008984 n = pCur->pPage->pgno; 008985 } 008986 pCell = findCell(pLeaf, pLeaf->nCell-1); 008987 if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT; 008988 nCell = pLeaf->xCellSize(pLeaf, pCell); 008989 assert( MX_CELL_SIZE(pBt) >= nCell ); 008990 pTmp = pBt->pTmpSpace; 008991 assert( pTmp!=0 ); 008992 rc = sqlite3PagerWrite(pLeaf->pDbPage); 008993 if( rc==SQLITE_OK ){ 008994 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc); 008995 } 008996 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); 008997 if( rc ) return rc; 008998 } 008999 009000 /* Balance the tree. If the entry deleted was located on a leaf page, 009001 ** then the cursor still points to that page. In this case the first 009002 ** call to balance() repairs the tree, and the if(...) condition is 009003 ** never true. 009004 ** 009005 ** Otherwise, if the entry deleted was on an internal node page, then 009006 ** pCur is pointing to the leaf page from which a cell was removed to 009007 ** replace the cell deleted from the internal node. This is slightly 009008 ** tricky as the leaf node may be underfull, and the internal node may 009009 ** be either under or overfull. In this case run the balancing algorithm 009010 ** on the leaf node first. If the balance proceeds far enough up the 009011 ** tree that we can be sure that any problem in the internal node has 009012 ** been corrected, so be it. Otherwise, after balancing the leaf node, 009013 ** walk the cursor up the tree to the internal node and balance it as 009014 ** well. 
*/ 009015 rc = balance(pCur); 009016 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ 009017 releasePageNotNull(pCur->pPage); 009018 pCur->iPage--; 009019 while( pCur->iPage>iCellDepth ){ 009020 releasePage(pCur->apPage[pCur->iPage--]); 009021 } 009022 pCur->pPage = pCur->apPage[pCur->iPage]; 009023 rc = balance(pCur); 009024 } 009025 009026 if( rc==SQLITE_OK ){ 009027 if( bSkipnext ){ 009028 assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) ); 009029 assert( pPage==pCur->pPage || CORRUPT_DB ); 009030 assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell ); 009031 pCur->eState = CURSOR_SKIPNEXT; 009032 if( iCellIdx>=pPage->nCell ){ 009033 pCur->skipNext = -1; 009034 pCur->ix = pPage->nCell-1; 009035 }else{ 009036 pCur->skipNext = 1; 009037 } 009038 }else{ 009039 rc = moveToRoot(pCur); 009040 if( bPreserve ){ 009041 btreeReleaseAllCursorPages(pCur); 009042 pCur->eState = CURSOR_REQUIRESEEK; 009043 } 009044 if( rc==SQLITE_EMPTY ) rc = SQLITE_OK; 009045 } 009046 } 009047 return rc; 009048 } 009049 009050 /* 009051 ** Create a new BTree table. Write into *piTable the page 009052 ** number for the root page of the new table. 009053 ** 009054 ** The type of type is determined by the flags parameter. Only the 009055 ** following values of flags are currently in use. 
Other values for 009056 ** flags might not work: 009057 ** 009058 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 009059 ** BTREE_ZERODATA Used for SQL indices 009060 */ 009061 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ 009062 BtShared *pBt = p->pBt; 009063 MemPage *pRoot; 009064 Pgno pgnoRoot; 009065 int rc; 009066 int ptfFlags; /* Page-type flage for the root page of new table */ 009067 009068 assert( sqlite3BtreeHoldsMutex(p) ); 009069 assert( pBt->inTransaction==TRANS_WRITE ); 009070 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 009071 009072 #ifdef SQLITE_OMIT_AUTOVACUUM 009073 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 009074 if( rc ){ 009075 return rc; 009076 } 009077 #else 009078 if( pBt->autoVacuum ){ 009079 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 009080 MemPage *pPageMove; /* The page to move to. */ 009081 009082 /* Creating a new table may probably require moving an existing database 009083 ** to make room for the new tables root page. In case this page turns 009084 ** out to be an overflow page, delete all overflow page-map caches 009085 ** held by open cursors. 009086 */ 009087 invalidateAllOverflowCache(pBt); 009088 009089 /* Read the value of meta[3] from the database to determine where the 009090 ** root page of the new table should go. meta[3] is the largest root-page 009091 ** created so far, so the new root-page is (meta[3]+1). 009092 */ 009093 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); 009094 pgnoRoot++; 009095 009096 /* The new root-page may not be allocated on a pointer-map page, or the 009097 ** PENDING_BYTE page. 009098 */ 009099 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 009100 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 009101 pgnoRoot++; 009102 } 009103 assert( pgnoRoot>=3 || CORRUPT_DB ); 009104 testcase( pgnoRoot<3 ); 009105 009106 /* Allocate a page. 
The page that currently resides at pgnoRoot will 009107 ** be moved to the allocated page (unless the allocated page happens 009108 ** to reside at pgnoRoot). 009109 */ 009110 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT); 009111 if( rc!=SQLITE_OK ){ 009112 return rc; 009113 } 009114 009115 if( pgnoMove!=pgnoRoot ){ 009116 /* pgnoRoot is the page that will be used for the root-page of 009117 ** the new table (assuming an error did not occur). But we were 009118 ** allocated pgnoMove. If required (i.e. if it was not allocated 009119 ** by extending the file), the current page at position pgnoMove 009120 ** is already journaled. 009121 */ 009122 u8 eType = 0; 009123 Pgno iPtrPage = 0; 009124 009125 /* Save the positions of any open cursors. This is required in 009126 ** case they are holding a reference to an xFetch reference 009127 ** corresponding to page pgnoRoot. */ 009128 rc = saveAllCursors(pBt, 0, 0); 009129 releasePage(pPageMove); 009130 if( rc!=SQLITE_OK ){ 009131 return rc; 009132 } 009133 009134 /* Move the page currently at pgnoRoot to pgnoMove. 
*/ 009135 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 009136 if( rc!=SQLITE_OK ){ 009137 return rc; 009138 } 009139 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 009140 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 009141 rc = SQLITE_CORRUPT_BKPT; 009142 } 009143 if( rc!=SQLITE_OK ){ 009144 releasePage(pRoot); 009145 return rc; 009146 } 009147 assert( eType!=PTRMAP_ROOTPAGE ); 009148 assert( eType!=PTRMAP_FREEPAGE ); 009149 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 009150 releasePage(pRoot); 009151 009152 /* Obtain the page at pgnoRoot */ 009153 if( rc!=SQLITE_OK ){ 009154 return rc; 009155 } 009156 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 009157 if( rc!=SQLITE_OK ){ 009158 return rc; 009159 } 009160 rc = sqlite3PagerWrite(pRoot->pDbPage); 009161 if( rc!=SQLITE_OK ){ 009162 releasePage(pRoot); 009163 return rc; 009164 } 009165 }else{ 009166 pRoot = pPageMove; 009167 } 009168 009169 /* Update the pointer-map and meta-data with the new root-page number. */ 009170 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); 009171 if( rc ){ 009172 releasePage(pRoot); 009173 return rc; 009174 } 009175 009176 /* When the new root page was allocated, page 1 was made writable in 009177 ** order either to increase the database filesize, or to decrement the 009178 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail. 
009179 */ 009180 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) ); 009181 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 009182 if( NEVER(rc) ){ 009183 releasePage(pRoot); 009184 return rc; 009185 } 009186 009187 }else{ 009188 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 009189 if( rc ) return rc; 009190 } 009191 #endif 009192 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 009193 if( createTabFlags & BTREE_INTKEY ){ 009194 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF; 009195 }else{ 009196 ptfFlags = PTF_ZERODATA | PTF_LEAF; 009197 } 009198 zeroPage(pRoot, ptfFlags); 009199 sqlite3PagerUnref(pRoot->pDbPage); 009200 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 ); 009201 *piTable = (int)pgnoRoot; 009202 return SQLITE_OK; 009203 } 009204 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ 009205 int rc; 009206 sqlite3BtreeEnter(p); 009207 rc = btreeCreateTable(p, piTable, flags); 009208 sqlite3BtreeLeave(p); 009209 return rc; 009210 } 009211 009212 /* 009213 ** Erase the given database page and all its children. Return 009214 ** the page to the freelist. 
009215 */ 009216 static int clearDatabasePage( 009217 BtShared *pBt, /* The BTree that contains the table */ 009218 Pgno pgno, /* Page number to clear */ 009219 int freePageFlag, /* Deallocate page if true */ 009220 int *pnChange /* Add number of Cells freed to this counter */ 009221 ){ 009222 MemPage *pPage; 009223 int rc; 009224 unsigned char *pCell; 009225 int i; 009226 int hdr; 009227 CellInfo info; 009228 009229 assert( sqlite3_mutex_held(pBt->mutex) ); 009230 if( pgno>btreePagecount(pBt) ){ 009231 return SQLITE_CORRUPT_BKPT; 009232 } 009233 rc = getAndInitPage(pBt, pgno, &pPage, 0, 0); 009234 if( rc ) return rc; 009235 if( pPage->bBusy ){ 009236 rc = SQLITE_CORRUPT_BKPT; 009237 goto cleardatabasepage_out; 009238 } 009239 pPage->bBusy = 1; 009240 hdr = pPage->hdrOffset; 009241 for(i=0; i<pPage->nCell; i++){ 009242 pCell = findCell(pPage, i); 009243 if( !pPage->leaf ){ 009244 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); 009245 if( rc ) goto cleardatabasepage_out; 009246 } 009247 rc = clearCell(pPage, pCell, &info); 009248 if( rc ) goto cleardatabasepage_out; 009249 } 009250 if( !pPage->leaf ){ 009251 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange); 009252 if( rc ) goto cleardatabasepage_out; 009253 }else if( pnChange ){ 009254 assert( pPage->intKey || CORRUPT_DB ); 009255 testcase( !pPage->intKey ); 009256 *pnChange += pPage->nCell; 009257 } 009258 if( freePageFlag ){ 009259 freePage(pPage, &rc); 009260 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 009261 zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF); 009262 } 009263 009264 cleardatabasepage_out: 009265 pPage->bBusy = 0; 009266 releasePage(pPage); 009267 return rc; 009268 } 009269 009270 /* 009271 ** Delete all information from a single table in the database. iTable is 009272 ** the page number of the root of the table. After this routine returns, 009273 ** the root page is empty, but still exists. 
009274 ** 009275 ** This routine will fail with SQLITE_LOCKED if there are any open 009276 ** read cursors on the table. Open write cursors are moved to the 009277 ** root of the table. 009278 ** 009279 ** If pnChange is not NULL, then table iTable must be an intkey table. The 009280 ** integer value pointed to by pnChange is incremented by the number of 009281 ** entries in the table. 009282 */ 009283 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){ 009284 int rc; 009285 BtShared *pBt = p->pBt; 009286 sqlite3BtreeEnter(p); 009287 assert( p->inTrans==TRANS_WRITE ); 009288 009289 rc = saveAllCursors(pBt, (Pgno)iTable, 0); 009290 009291 if( SQLITE_OK==rc ){ 009292 /* Invalidate all incrblob cursors open on table iTable (assuming iTable 009293 ** is the root of a table b-tree - if it is not, the following call is 009294 ** a no-op). */ 009295 invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1); 009296 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); 009297 } 009298 sqlite3BtreeLeave(p); 009299 return rc; 009300 } 009301 009302 /* 009303 ** Delete all information from the single table that pCur is open on. 009304 ** 009305 ** This routine only work for pCur on an ephemeral table. 009306 */ 009307 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){ 009308 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0); 009309 } 009310 009311 /* 009312 ** Erase all information in a table and add the root of the table to 009313 ** the freelist. Except, the root of the principle table (the one on 009314 ** page 1) is never added to the freelist. 009315 ** 009316 ** This routine will fail with SQLITE_LOCKED if there are any open 009317 ** cursors on the table. 
009318 ** 009319 ** If AUTOVACUUM is enabled and the page at iTable is not the last 009320 ** root page in the database file, then the last root page 009321 ** in the database file is moved into the slot formerly occupied by 009322 ** iTable and that last slot formerly occupied by the last root page 009323 ** is added to the freelist instead of iTable. In this say, all 009324 ** root pages are kept at the beginning of the database file, which 009325 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 009326 ** page number that used to be the last root page in the file before 009327 ** the move. If no page gets moved, *piMoved is set to 0. 009328 ** The last root page is recorded in meta[3] and the value of 009329 ** meta[3] is updated by this procedure. 009330 */ 009331 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ 009332 int rc; 009333 MemPage *pPage = 0; 009334 BtShared *pBt = p->pBt; 009335 009336 assert( sqlite3BtreeHoldsMutex(p) ); 009337 assert( p->inTrans==TRANS_WRITE ); 009338 assert( iTable>=2 ); 009339 if( iTable>btreePagecount(pBt) ){ 009340 return SQLITE_CORRUPT_BKPT; 009341 } 009342 009343 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 009344 if( rc ) return rc; 009345 rc = sqlite3BtreeClearTable(p, iTable, 0); 009346 if( rc ){ 009347 releasePage(pPage); 009348 return rc; 009349 } 009350 009351 *piMoved = 0; 009352 009353 #ifdef SQLITE_OMIT_AUTOVACUUM 009354 freePage(pPage, &rc); 009355 releasePage(pPage); 009356 #else 009357 if( pBt->autoVacuum ){ 009358 Pgno maxRootPgno; 009359 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); 009360 009361 if( iTable==maxRootPgno ){ 009362 /* If the table being dropped is the table with the largest root-page 009363 ** number in the database, put the root page on the free list. 
009364 */ 009365 freePage(pPage, &rc); 009366 releasePage(pPage); 009367 if( rc!=SQLITE_OK ){ 009368 return rc; 009369 } 009370 }else{ 009371 /* The table being dropped does not have the largest root-page 009372 ** number in the database. So move the page that does into the 009373 ** gap left by the deleted root-page. 009374 */ 009375 MemPage *pMove; 009376 releasePage(pPage); 009377 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 009378 if( rc!=SQLITE_OK ){ 009379 return rc; 009380 } 009381 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 009382 releasePage(pMove); 009383 if( rc!=SQLITE_OK ){ 009384 return rc; 009385 } 009386 pMove = 0; 009387 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 009388 freePage(pMove, &rc); 009389 releasePage(pMove); 009390 if( rc!=SQLITE_OK ){ 009391 return rc; 009392 } 009393 *piMoved = maxRootPgno; 009394 } 009395 009396 /* Set the new 'max-root-page' value in the database header. This 009397 ** is the old value less one, less one more if that happens to 009398 ** be a root-page number, less one again if that is the 009399 ** PENDING_BYTE_PAGE. 009400 */ 009401 maxRootPgno--; 009402 while( maxRootPgno==PENDING_BYTE_PAGE(pBt) 009403 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ 009404 maxRootPgno--; 009405 } 009406 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 009407 009408 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 009409 }else{ 009410 freePage(pPage, &rc); 009411 releasePage(pPage); 009412 } 009413 #endif 009414 return rc; 009415 } 009416 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 009417 int rc; 009418 sqlite3BtreeEnter(p); 009419 rc = btreeDropTable(p, iTable, piMoved); 009420 sqlite3BtreeLeave(p); 009421 return rc; 009422 } 009423 009424 009425 /* 009426 ** This function may only be called if the b-tree connection already 009427 ** has a read or write transaction open on the database. 009428 ** 009429 ** Read the meta-information out of a database file. 
Meta[0] 009430 ** is the number of free pages currently in the database. Meta[1] 009431 ** through meta[15] are available for use by higher layers. Meta[0] 009432 ** is read-only, the others are read/write. 009433 ** 009434 ** The schema layer numbers meta values differently. At the schema 009435 ** layer (and the SetCookie and ReadCookie opcodes) the number of 009436 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 009437 ** 009438 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead 009439 ** of reading the value out of the header, it instead loads the "DataVersion" 009440 ** from the pager. The BTREE_DATA_VERSION value is not actually stored in the 009441 ** database file. It is a number computed by the pager. But its access 009442 ** pattern is the same as header meta values, and so it is convenient to 009443 ** read it from this routine. 009444 */ 009445 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 009446 BtShared *pBt = p->pBt; 009447 009448 sqlite3BtreeEnter(p); 009449 assert( p->inTrans>TRANS_NONE ); 009450 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) ); 009451 assert( pBt->pPage1 ); 009452 assert( idx>=0 && idx<=15 ); 009453 009454 if( idx==BTREE_DATA_VERSION ){ 009455 *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion; 009456 }else{ 009457 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); 009458 } 009459 009460 /* If auto-vacuum is disabled in this build and this is an auto-vacuum 009461 ** database, mark the database as read-only. */ 009462 #ifdef SQLITE_OMIT_AUTOVACUUM 009463 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){ 009464 pBt->btsFlags |= BTS_READ_ONLY; 009465 } 009466 #endif 009467 009468 sqlite3BtreeLeave(p); 009469 } 009470 009471 /* 009472 ** Write meta-information back into the database. Meta[0] is 009473 ** read-only and may not be written. 
009474 */ 009475 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 009476 BtShared *pBt = p->pBt; 009477 unsigned char *pP1; 009478 int rc; 009479 assert( idx>=1 && idx<=15 ); 009480 sqlite3BtreeEnter(p); 009481 assert( p->inTrans==TRANS_WRITE ); 009482 assert( pBt->pPage1!=0 ); 009483 pP1 = pBt->pPage1->aData; 009484 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 009485 if( rc==SQLITE_OK ){ 009486 put4byte(&pP1[36 + idx*4], iMeta); 009487 #ifndef SQLITE_OMIT_AUTOVACUUM 009488 if( idx==BTREE_INCR_VACUUM ){ 009489 assert( pBt->autoVacuum || iMeta==0 ); 009490 assert( iMeta==0 || iMeta==1 ); 009491 pBt->incrVacuum = (u8)iMeta; 009492 } 009493 #endif 009494 } 009495 sqlite3BtreeLeave(p); 009496 return rc; 009497 } 009498 009499 #ifndef SQLITE_OMIT_BTREECOUNT 009500 /* 009501 ** The first argument, pCur, is a cursor opened on some b-tree. Count the 009502 ** number of entries in the b-tree and write the result to *pnEntry. 009503 ** 009504 ** SQLITE_OK is returned if the operation is successfully executed. 009505 ** Otherwise, if an error is encountered (i.e. an IO error or database 009506 ** corruption) an SQLite error code is returned. 009507 */ 009508 int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){ 009509 i64 nEntry = 0; /* Value to return in *pnEntry */ 009510 int rc; /* Return code */ 009511 009512 rc = moveToRoot(pCur); 009513 if( rc==SQLITE_EMPTY ){ 009514 *pnEntry = 0; 009515 return SQLITE_OK; 009516 } 009517 009518 /* Unless an error occurs, the following loop runs one iteration for each 009519 ** page in the B-Tree structure (not including overflow pages). 009520 */ 009521 while( rc==SQLITE_OK && !db->u1.isInterrupted ){ 009522 int iIdx; /* Index of child node in parent */ 009523 MemPage *pPage; /* Current page of the b-tree */ 009524 009525 /* If this is a leaf page or the tree is not an int-key tree, then 009526 ** this page contains countable entries. Increment the entry counter 009527 ** accordingly. 
009528 */ 009529 pPage = pCur->pPage; 009530 if( pPage->leaf || !pPage->intKey ){ 009531 nEntry += pPage->nCell; 009532 } 009533 009534 /* pPage is a leaf node. This loop navigates the cursor so that it 009535 ** points to the first interior cell that it points to the parent of 009536 ** the next page in the tree that has not yet been visited. The 009537 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell 009538 ** of the page, or to the number of cells in the page if the next page 009539 ** to visit is the right-child of its parent. 009540 ** 009541 ** If all pages in the tree have been visited, return SQLITE_OK to the 009542 ** caller. 009543 */ 009544 if( pPage->leaf ){ 009545 do { 009546 if( pCur->iPage==0 ){ 009547 /* All pages of the b-tree have been visited. Return successfully. */ 009548 *pnEntry = nEntry; 009549 return moveToRoot(pCur); 009550 } 009551 moveToParent(pCur); 009552 }while ( pCur->ix>=pCur->pPage->nCell ); 009553 009554 pCur->ix++; 009555 pPage = pCur->pPage; 009556 } 009557 009558 /* Descend to the child node of the cell that the cursor currently 009559 ** points at. This is the right-child if (iIdx==pPage->nCell). 009560 */ 009561 iIdx = pCur->ix; 009562 if( iIdx==pPage->nCell ){ 009563 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 009564 }else{ 009565 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); 009566 } 009567 } 009568 009569 /* An error has occurred. Return an error code. */ 009570 return rc; 009571 } 009572 #endif 009573 009574 /* 009575 ** Return the pager associated with a BTree. This routine is used for 009576 ** testing and debugging only. 009577 */ 009578 Pager *sqlite3BtreePager(Btree *p){ 009579 return p->pBt->pPager; 009580 } 009581 009582 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 009583 /* 009584 ** Append a message to the error message string. 009585 */ 009586 static void checkAppendMsg( 009587 IntegrityCk *pCheck, 009588 const char *zFormat, 009589 ... 
009590 ){ 009591 va_list ap; 009592 if( !pCheck->mxErr ) return; 009593 pCheck->mxErr--; 009594 pCheck->nErr++; 009595 va_start(ap, zFormat); 009596 if( pCheck->errMsg.nChar ){ 009597 sqlite3_str_append(&pCheck->errMsg, "\n", 1); 009598 } 009599 if( pCheck->zPfx ){ 009600 sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2); 009601 } 009602 sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap); 009603 va_end(ap); 009604 if( pCheck->errMsg.accError==SQLITE_NOMEM ){ 009605 pCheck->mallocFailed = 1; 009606 } 009607 } 009608 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 009609 009610 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 009611 009612 /* 009613 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that 009614 ** corresponds to page iPg is already set. 009615 */ 009616 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 009617 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 ); 009618 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07))); 009619 } 009620 009621 /* 009622 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg. 009623 */ 009624 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 009625 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 ); 009626 pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07)); 009627 } 009628 009629 009630 /* 009631 ** Add 1 to the reference count for page iPage. If this is the second 009632 ** reference to the page, add an error message to pCheck->zErrMsg. 009633 ** Return 1 if there are 2 or more references to the page and 0 if 009634 ** if this is the first reference to the page. 009635 ** 009636 ** Also check that the page number is in bounds. 
009637 */ 009638 static int checkRef(IntegrityCk *pCheck, Pgno iPage){ 009639 if( iPage>pCheck->nPage || iPage==0 ){ 009640 checkAppendMsg(pCheck, "invalid page number %d", iPage); 009641 return 1; 009642 } 009643 if( getPageReferenced(pCheck, iPage) ){ 009644 checkAppendMsg(pCheck, "2nd reference to page %d", iPage); 009645 return 1; 009646 } 009647 if( pCheck->db->u1.isInterrupted ) return 1; 009648 setPageReferenced(pCheck, iPage); 009649 return 0; 009650 } 009651 009652 #ifndef SQLITE_OMIT_AUTOVACUUM 009653 /* 009654 ** Check that the entry in the pointer-map for page iChild maps to 009655 ** page iParent, pointer type ptrType. If not, append an error message 009656 ** to pCheck. 009657 */ 009658 static void checkPtrmap( 009659 IntegrityCk *pCheck, /* Integrity check context */ 009660 Pgno iChild, /* Child page number */ 009661 u8 eType, /* Expected pointer map type */ 009662 Pgno iParent /* Expected pointer map parent page number */ 009663 ){ 009664 int rc; 009665 u8 ePtrmapType; 009666 Pgno iPtrmapParent; 009667 009668 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 009669 if( rc!=SQLITE_OK ){ 009670 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1; 009671 checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild); 009672 return; 009673 } 009674 009675 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 009676 checkAppendMsg(pCheck, 009677 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 009678 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 009679 } 009680 } 009681 #endif 009682 009683 /* 009684 ** Check the integrity of the freelist or of an overflow page list. 009685 ** Verify that the number of pages on the list is N. 009686 */ 009687 static void checkList( 009688 IntegrityCk *pCheck, /* Integrity checking context */ 009689 int isFreeList, /* True for a freelist. 
False for overflow page list */ 009690 int iPage, /* Page number for first page in the list */ 009691 u32 N /* Expected number of pages in the list */ 009692 ){ 009693 int i; 009694 u32 expected = N; 009695 int nErrAtStart = pCheck->nErr; 009696 while( iPage!=0 && pCheck->mxErr ){ 009697 DbPage *pOvflPage; 009698 unsigned char *pOvflData; 009699 if( checkRef(pCheck, iPage) ) break; 009700 N--; 009701 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){ 009702 checkAppendMsg(pCheck, "failed to get page %d", iPage); 009703 break; 009704 } 009705 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 009706 if( isFreeList ){ 009707 u32 n = (u32)get4byte(&pOvflData[4]); 009708 #ifndef SQLITE_OMIT_AUTOVACUUM 009709 if( pCheck->pBt->autoVacuum ){ 009710 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0); 009711 } 009712 #endif 009713 if( n>pCheck->pBt->usableSize/4-2 ){ 009714 checkAppendMsg(pCheck, 009715 "freelist leaf count too big on page %d", iPage); 009716 N--; 009717 }else{ 009718 for(i=0; i<(int)n; i++){ 009719 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 009720 #ifndef SQLITE_OMIT_AUTOVACUUM 009721 if( pCheck->pBt->autoVacuum ){ 009722 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0); 009723 } 009724 #endif 009725 checkRef(pCheck, iFreePage); 009726 } 009727 N -= n; 009728 } 009729 } 009730 #ifndef SQLITE_OMIT_AUTOVACUUM 009731 else{ 009732 /* If this database supports auto-vacuum and iPage is not the last 009733 ** page in this overflow list, check that the pointer-map entry for 009734 ** the following page matches iPage. 009735 */ 009736 if( pCheck->pBt->autoVacuum && N>0 ){ 009737 i = get4byte(pOvflData); 009738 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage); 009739 } 009740 } 009741 #endif 009742 iPage = get4byte(pOvflData); 009743 sqlite3PagerUnref(pOvflPage); 009744 } 009745 if( N && nErrAtStart==pCheck->nErr ){ 009746 checkAppendMsg(pCheck, 009747 "%s is %d but should be %d", 009748 isFreeList ? 
"size" : "overflow list length", 009749 expected-N, expected); 009750 } 009751 } 009752 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 009753 009754 /* 009755 ** An implementation of a min-heap. 009756 ** 009757 ** aHeap[0] is the number of elements on the heap. aHeap[1] is the 009758 ** root element. The daughter nodes of aHeap[N] are aHeap[N*2] 009759 ** and aHeap[N*2+1]. 009760 ** 009761 ** The heap property is this: Every node is less than or equal to both 009762 ** of its daughter nodes. A consequence of the heap property is that the 009763 ** root node aHeap[1] is always the minimum value currently in the heap. 009764 ** 009765 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto 009766 ** the heap, preserving the heap property. The btreeHeapPull() routine 009767 ** removes the root element from the heap (the minimum value in the heap) 009768 ** and then moves other nodes around as necessary to preserve the heap 009769 ** property. 009770 ** 009771 ** This heap is used for cell overlap and coverage testing. Each u32 009772 ** entry represents the span of a cell or freeblock on a btree page. 009773 ** The upper 16 bits are the index of the first byte of a range and the 009774 ** lower 16 bits are the index of the last byte of that range. 
009775 */ 009776 static void btreeHeapInsert(u32 *aHeap, u32 x){ 009777 u32 j, i = ++aHeap[0]; 009778 aHeap[i] = x; 009779 while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){ 009780 x = aHeap[j]; 009781 aHeap[j] = aHeap[i]; 009782 aHeap[i] = x; 009783 i = j; 009784 } 009785 } 009786 static int btreeHeapPull(u32 *aHeap, u32 *pOut){ 009787 u32 j, i, x; 009788 if( (x = aHeap[0])==0 ) return 0; 009789 *pOut = aHeap[1]; 009790 aHeap[1] = aHeap[x]; 009791 aHeap[x] = 0xffffffff; 009792 aHeap[0]--; 009793 i = 1; 009794 while( (j = i*2)<=aHeap[0] ){ 009795 if( aHeap[j]>aHeap[j+1] ) j++; 009796 if( aHeap[i]<aHeap[j] ) break; 009797 x = aHeap[i]; 009798 aHeap[i] = aHeap[j]; 009799 aHeap[j] = x; 009800 i = j; 009801 } 009802 return 1; 009803 } 009804 009805 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 009806 /* 009807 ** Do various sanity checks on a single page of a tree. Return 009808 ** the tree depth. Root pages return 0. Parents of root pages 009809 ** return 1, and so forth. 009810 ** 009811 ** These checks are done: 009812 ** 009813 ** 1. Make sure that cells and freeblocks do not overlap 009814 ** but combine to completely cover the page. 009815 ** 2. Make sure integer cell keys are in order. 009816 ** 3. Check the integrity of overflow pages. 009817 ** 4. Recursively call checkTreePage on all children. 009818 ** 5. Verify that the depth of all children is the same. 
009819 */ 009820 static int checkTreePage( 009821 IntegrityCk *pCheck, /* Context for the sanity check */ 009822 int iPage, /* Page number of the page to check */ 009823 i64 *piMinKey, /* Write minimum integer primary key here */ 009824 i64 maxKey /* Error if integer primary key greater than this */ 009825 ){ 009826 MemPage *pPage = 0; /* The page being analyzed */ 009827 int i; /* Loop counter */ 009828 int rc; /* Result code from subroutine call */ 009829 int depth = -1, d2; /* Depth of a subtree */ 009830 int pgno; /* Page number */ 009831 int nFrag; /* Number of fragmented bytes on the page */ 009832 int hdr; /* Offset to the page header */ 009833 int cellStart; /* Offset to the start of the cell pointer array */ 009834 int nCell; /* Number of cells */ 009835 int doCoverageCheck = 1; /* True if cell coverage checking should be done */ 009836 int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey 009837 ** False if IPK must be strictly less than maxKey */ 009838 u8 *data; /* Page content */ 009839 u8 *pCell; /* Cell content */ 009840 u8 *pCellIdx; /* Next element of the cell pointer array */ 009841 BtShared *pBt; /* The BtShared object that owns pPage */ 009842 u32 pc; /* Address of a cell */ 009843 u32 usableSize; /* Usable size of the page */ 009844 u32 contentOffset; /* Offset to the start of the cell content area */ 009845 u32 *heap = 0; /* Min-heap used for checking cell coverage */ 009846 u32 x, prev = 0; /* Next and previous entry on the min-heap */ 009847 const char *saved_zPfx = pCheck->zPfx; 009848 int saved_v1 = pCheck->v1; 009849 int saved_v2 = pCheck->v2; 009850 u8 savedIsInit = 0; 009851 009852 /* Check that the page exists 009853 */ 009854 pBt = pCheck->pBt; 009855 usableSize = pBt->usableSize; 009856 if( iPage==0 ) return 0; 009857 if( checkRef(pCheck, iPage) ) return 0; 009858 pCheck->zPfx = "Page %d: "; 009859 pCheck->v1 = iPage; 009860 if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ 009861 checkAppendMsg(pCheck, 009862 "unable 
to get the page. error code=%d", rc); 009863 goto end_of_check; 009864 } 009865 009866 /* Clear MemPage.isInit to make sure the corruption detection code in 009867 ** btreeInitPage() is executed. */ 009868 savedIsInit = pPage->isInit; 009869 pPage->isInit = 0; 009870 if( (rc = btreeInitPage(pPage))!=0 ){ 009871 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ 009872 checkAppendMsg(pCheck, 009873 "btreeInitPage() returns error code %d", rc); 009874 goto end_of_check; 009875 } 009876 if( (rc = btreeComputeFreeSpace(pPage))!=0 ){ 009877 assert( rc==SQLITE_CORRUPT ); 009878 checkAppendMsg(pCheck, "free space corruption", rc); 009879 goto end_of_check; 009880 } 009881 data = pPage->aData; 009882 hdr = pPage->hdrOffset; 009883 009884 /* Set up for cell analysis */ 009885 pCheck->zPfx = "On tree page %d cell %d: "; 009886 contentOffset = get2byteNotZero(&data[hdr+5]); 009887 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ 009888 009889 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the 009890 ** number of cells on the page. */ 009891 nCell = get2byte(&data[hdr+3]); 009892 assert( pPage->nCell==nCell ); 009893 009894 /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page 009895 ** immediately follows the b-tree page header. 
*/ 009896 cellStart = hdr + 12 - 4*pPage->leaf; 009897 assert( pPage->aCellIdx==&data[cellStart] ); 009898 pCellIdx = &data[cellStart + 2*(nCell-1)]; 009899 009900 if( !pPage->leaf ){ 009901 /* Analyze the right-child page of internal pages */ 009902 pgno = get4byte(&data[hdr+8]); 009903 #ifndef SQLITE_OMIT_AUTOVACUUM 009904 if( pBt->autoVacuum ){ 009905 pCheck->zPfx = "On page %d at right child: "; 009906 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); 009907 } 009908 #endif 009909 depth = checkTreePage(pCheck, pgno, &maxKey, maxKey); 009910 keyCanBeEqual = 0; 009911 }else{ 009912 /* For leaf pages, the coverage check will occur in the same loop 009913 ** as the other cell checks, so initialize the heap. */ 009914 heap = pCheck->heap; 009915 heap[0] = 0; 009916 } 009917 009918 /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte 009919 ** integer offsets to the cell contents. */ 009920 for(i=nCell-1; i>=0 && pCheck->mxErr; i--){ 009921 CellInfo info; 009922 009923 /* Check cell size */ 009924 pCheck->v2 = i; 009925 assert( pCellIdx==&data[cellStart + i*2] ); 009926 pc = get2byteAligned(pCellIdx); 009927 pCellIdx -= 2; 009928 if( pc<contentOffset || pc>usableSize-4 ){ 009929 checkAppendMsg(pCheck, "Offset %d out of range %d..%d", 009930 pc, contentOffset, usableSize-4); 009931 doCoverageCheck = 0; 009932 continue; 009933 } 009934 pCell = &data[pc]; 009935 pPage->xParseCell(pPage, pCell, &info); 009936 if( pc+info.nSize>usableSize ){ 009937 checkAppendMsg(pCheck, "Extends off end of page"); 009938 doCoverageCheck = 0; 009939 continue; 009940 } 009941 009942 /* Check for integer primary key out of range */ 009943 if( pPage->intKey ){ 009944 if( keyCanBeEqual ? 
(info.nKey > maxKey) : (info.nKey >= maxKey) ){ 009945 checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey); 009946 } 009947 maxKey = info.nKey; 009948 keyCanBeEqual = 0; /* Only the first key on the page may ==maxKey */ 009949 } 009950 009951 /* Check the content overflow list */ 009952 if( info.nPayload>info.nLocal ){ 009953 u32 nPage; /* Number of pages on the overflow chain */ 009954 Pgno pgnoOvfl; /* First page of the overflow chain */ 009955 assert( pc + info.nSize - 4 <= usableSize ); 009956 nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4); 009957 pgnoOvfl = get4byte(&pCell[info.nSize - 4]); 009958 #ifndef SQLITE_OMIT_AUTOVACUUM 009959 if( pBt->autoVacuum ){ 009960 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage); 009961 } 009962 #endif 009963 checkList(pCheck, 0, pgnoOvfl, nPage); 009964 } 009965 009966 if( !pPage->leaf ){ 009967 /* Check sanity of left child page for internal pages */ 009968 pgno = get4byte(pCell); 009969 #ifndef SQLITE_OMIT_AUTOVACUUM 009970 if( pBt->autoVacuum ){ 009971 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); 009972 } 009973 #endif 009974 d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey); 009975 keyCanBeEqual = 0; 009976 if( d2!=depth ){ 009977 checkAppendMsg(pCheck, "Child page depth differs"); 009978 depth = d2; 009979 } 009980 }else{ 009981 /* Populate the coverage-checking heap for leaf pages */ 009982 btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1)); 009983 } 009984 } 009985 *piMinKey = maxKey; 009986 009987 /* Check for complete coverage of the page 009988 */ 009989 pCheck->zPfx = 0; 009990 if( doCoverageCheck && pCheck->mxErr>0 ){ 009991 /* For leaf pages, the min-heap has already been initialized and the 009992 ** cells have already been inserted. 
But for internal pages, that has 009993 ** not yet been done, so do it now */ 009994 if( !pPage->leaf ){ 009995 heap = pCheck->heap; 009996 heap[0] = 0; 009997 for(i=nCell-1; i>=0; i--){ 009998 u32 size; 009999 pc = get2byteAligned(&data[cellStart+i*2]); 010000 size = pPage->xCellSize(pPage, &data[pc]); 010001 btreeHeapInsert(heap, (pc<<16)|(pc+size-1)); 010002 } 010003 } 010004 /* Add the freeblocks to the min-heap 010005 ** 010006 ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header 010007 ** is the offset of the first freeblock, or zero if there are no 010008 ** freeblocks on the page. 010009 */ 010010 i = get2byte(&data[hdr+1]); 010011 while( i>0 ){ 010012 int size, j; 010013 assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ 010014 size = get2byte(&data[i+2]); 010015 assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */ 010016 btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1)); 010017 /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a 010018 ** big-endian integer which is the offset in the b-tree page of the next 010019 ** freeblock in the chain, or zero if the freeblock is the last on the 010020 ** chain. */ 010021 j = get2byte(&data[i]); 010022 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of 010023 ** increasing offset. */ 010024 assert( j==0 || j>i+size ); /* Enforced by btreeComputeFreeSpace() */ 010025 assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ 010026 i = j; 010027 } 010028 /* Analyze the min-heap looking for overlap between cells and/or 010029 ** freeblocks, and counting the number of untracked bytes in nFrag. 010030 ** 010031 ** Each min-heap entry is of the form: (start_address<<16)|end_address. 010032 ** There is an implied first entry the covers the page header, the cell 010033 ** pointer index, and the gap between the cell pointer index and the start 010034 ** of cell content. 
010035 ** 010036 ** The loop below pulls entries from the min-heap in order and compares 010037 ** the start_address against the previous end_address. If there is an 010038 ** overlap, that means bytes are used multiple times. If there is a gap, 010039 ** that gap is added to the fragmentation count. 010040 */ 010041 nFrag = 0; 010042 prev = contentOffset - 1; /* Implied first min-heap entry */ 010043 while( btreeHeapPull(heap,&x) ){ 010044 if( (prev&0xffff)>=(x>>16) ){ 010045 checkAppendMsg(pCheck, 010046 "Multiple uses for byte %u of page %d", x>>16, iPage); 010047 break; 010048 }else{ 010049 nFrag += (x>>16) - (prev&0xffff) - 1; 010050 prev = x; 010051 } 010052 } 010053 nFrag += usableSize - (prev&0xffff) - 1; 010054 /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments 010055 ** is stored in the fifth field of the b-tree page header. 010056 ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the 010057 ** number of fragmented free bytes within the cell content area. 010058 */ 010059 if( heap[0]==0 && nFrag!=data[hdr+7] ){ 010060 checkAppendMsg(pCheck, 010061 "Fragmentation of %d bytes reported as %d on page %d", 010062 nFrag, data[hdr+7], iPage); 010063 } 010064 } 010065 010066 end_of_check: 010067 if( !doCoverageCheck ) pPage->isInit = savedIsInit; 010068 releasePage(pPage); 010069 pCheck->zPfx = saved_zPfx; 010070 pCheck->v1 = saved_v1; 010071 pCheck->v2 = saved_v2; 010072 return depth+1; 010073 } 010074 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010075 010076 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010077 /* 010078 ** This routine does a complete check of the given BTree file. aRoot[] is 010079 ** an array of pages numbers were each page number is the root page of 010080 ** a table. nRoot is the number of entries in aRoot. 010081 ** 010082 ** A read-only or read-write transaction must be opened before calling 010083 ** this function. 010084 ** 010085 ** Write the number of error seen in *pnErr. 
Except for some memory 010086 ** allocation errors, an error message held in memory obtained from 010087 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is 010088 ** returned. If a memory allocation error occurs, NULL is returned. 010089 */ 010090 char *sqlite3BtreeIntegrityCheck( 010091 sqlite3 *db, /* Database connection that is running the check */ 010092 Btree *p, /* The btree to be checked */ 010093 int *aRoot, /* An array of root pages numbers for individual trees */ 010094 int nRoot, /* Number of entries in aRoot[] */ 010095 int mxErr, /* Stop reporting errors after this many */ 010096 int *pnErr /* Write number of errors seen to this variable */ 010097 ){ 010098 Pgno i; 010099 IntegrityCk sCheck; 010100 BtShared *pBt = p->pBt; 010101 u64 savedDbFlags = pBt->db->flags; 010102 char zErr[100]; 010103 VVA_ONLY( int nRef ); 010104 010105 sqlite3BtreeEnter(p); 010106 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); 010107 VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) ); 010108 assert( nRef>=0 ); 010109 sCheck.db = db; 010110 sCheck.pBt = pBt; 010111 sCheck.pPager = pBt->pPager; 010112 sCheck.nPage = btreePagecount(sCheck.pBt); 010113 sCheck.mxErr = mxErr; 010114 sCheck.nErr = 0; 010115 sCheck.mallocFailed = 0; 010116 sCheck.zPfx = 0; 010117 sCheck.v1 = 0; 010118 sCheck.v2 = 0; 010119 sCheck.aPgRef = 0; 010120 sCheck.heap = 0; 010121 sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH); 010122 sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL; 010123 if( sCheck.nPage==0 ){ 010124 goto integrity_ck_cleanup; 010125 } 010126 010127 sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1); 010128 if( !sCheck.aPgRef ){ 010129 sCheck.mallocFailed = 1; 010130 goto integrity_ck_cleanup; 010131 } 010132 sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize ); 010133 if( sCheck.heap==0 ){ 010134 sCheck.mallocFailed = 1; 010135 goto integrity_ck_cleanup; 010136 } 010137 010138 i = PENDING_BYTE_PAGE(pBt); 
010139 if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i); 010140 010141 /* Check the integrity of the freelist 010142 */ 010143 sCheck.zPfx = "Main freelist: "; 010144 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 010145 get4byte(&pBt->pPage1->aData[36])); 010146 sCheck.zPfx = 0; 010147 010148 /* Check all the tables. 010149 */ 010150 #ifndef SQLITE_OMIT_AUTOVACUUM 010151 if( pBt->autoVacuum ){ 010152 int mx = 0; 010153 int mxInHdr; 010154 for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i]; 010155 mxInHdr = get4byte(&pBt->pPage1->aData[52]); 010156 if( mx!=mxInHdr ){ 010157 checkAppendMsg(&sCheck, 010158 "max rootpage (%d) disagrees with header (%d)", 010159 mx, mxInHdr 010160 ); 010161 } 010162 }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){ 010163 checkAppendMsg(&sCheck, 010164 "incremental_vacuum enabled with a max rootpage of zero" 010165 ); 010166 } 010167 #endif 010168 testcase( pBt->db->flags & SQLITE_CellSizeCk ); 010169 pBt->db->flags &= ~(u64)SQLITE_CellSizeCk; 010170 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ 010171 i64 notUsed; 010172 if( aRoot[i]==0 ) continue; 010173 #ifndef SQLITE_OMIT_AUTOVACUUM 010174 if( pBt->autoVacuum && aRoot[i]>1 ){ 010175 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0); 010176 } 010177 #endif 010178 checkTreePage(&sCheck, aRoot[i], ¬Used, LARGEST_INT64); 010179 } 010180 pBt->db->flags = savedDbFlags; 010181 010182 /* Make sure every page in the file is referenced 010183 */ 010184 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ 010185 #ifdef SQLITE_OMIT_AUTOVACUUM 010186 if( getPageReferenced(&sCheck, i)==0 ){ 010187 checkAppendMsg(&sCheck, "Page %d is never used", i); 010188 } 010189 #else 010190 /* If the database supports auto-vacuum, make sure no tables contain 010191 ** references to pointer-map pages. 
010192 */ 010193 if( getPageReferenced(&sCheck, i)==0 && 010194 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 010195 checkAppendMsg(&sCheck, "Page %d is never used", i); 010196 } 010197 if( getPageReferenced(&sCheck, i)!=0 && 010198 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 010199 checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i); 010200 } 010201 #endif 010202 } 010203 010204 /* Clean up and report errors. 010205 */ 010206 integrity_ck_cleanup: 010207 sqlite3PageFree(sCheck.heap); 010208 sqlite3_free(sCheck.aPgRef); 010209 if( sCheck.mallocFailed ){ 010210 sqlite3_str_reset(&sCheck.errMsg); 010211 sCheck.nErr++; 010212 } 010213 *pnErr = sCheck.nErr; 010214 if( sCheck.nErr==0 ) sqlite3_str_reset(&sCheck.errMsg); 010215 /* Make sure this analysis did not leave any unref() pages. */ 010216 assert( nRef==sqlite3PagerRefcount(pBt->pPager) ); 010217 sqlite3BtreeLeave(p); 010218 return sqlite3StrAccumFinish(&sCheck.errMsg); 010219 } 010220 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010221 010222 /* 010223 ** Return the full pathname of the underlying database file. Return 010224 ** an empty string if the database is in-memory or a TEMP database. 010225 ** 010226 ** The pager filename is invariant as long as the pager is 010227 ** open so it is safe to access without the BtShared mutex. 010228 */ 010229 const char *sqlite3BtreeGetFilename(Btree *p){ 010230 assert( p->pBt->pPager!=0 ); 010231 return sqlite3PagerFilename(p->pBt->pPager, 1); 010232 } 010233 010234 /* 010235 ** Return the pathname of the journal file for this database. The return 010236 ** value of this routine is the same regardless of whether the journal file 010237 ** has been created or not. 010238 ** 010239 ** The pager journal filename is invariant as long as the pager is 010240 ** open so it is safe to access without the BtShared mutex. 
010241 */ 010242 const char *sqlite3BtreeGetJournalname(Btree *p){ 010243 assert( p->pBt->pPager!=0 ); 010244 return sqlite3PagerJournalname(p->pBt->pPager); 010245 } 010246 010247 /* 010248 ** Return non-zero if a transaction is active. 010249 */ 010250 int sqlite3BtreeIsInTrans(Btree *p){ 010251 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 010252 return (p && (p->inTrans==TRANS_WRITE)); 010253 } 010254 010255 #ifndef SQLITE_OMIT_WAL 010256 /* 010257 ** Run a checkpoint on the Btree passed as the first argument. 010258 ** 010259 ** Return SQLITE_LOCKED if this or any other connection has an open 010260 ** transaction on the shared-cache the argument Btree is connected to. 010261 ** 010262 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART. 010263 */ 010264 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){ 010265 int rc = SQLITE_OK; 010266 if( p ){ 010267 BtShared *pBt = p->pBt; 010268 sqlite3BtreeEnter(p); 010269 if( pBt->inTransaction!=TRANS_NONE ){ 010270 rc = SQLITE_LOCKED; 010271 }else{ 010272 rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt); 010273 } 010274 sqlite3BtreeLeave(p); 010275 } 010276 return rc; 010277 } 010278 #endif 010279 010280 /* 010281 ** Return non-zero if a read (or write) transaction is active. 010282 */ 010283 int sqlite3BtreeIsInReadTrans(Btree *p){ 010284 assert( p ); 010285 assert( sqlite3_mutex_held(p->db->mutex) ); 010286 return p->inTrans!=TRANS_NONE; 010287 } 010288 010289 int sqlite3BtreeIsInBackup(Btree *p){ 010290 assert( p ); 010291 assert( sqlite3_mutex_held(p->db->mutex) ); 010292 return p->nBackup!=0; 010293 } 010294 010295 /* 010296 ** This function returns a pointer to a blob of memory associated with 010297 ** a single shared-btree. The memory is used by client code for its own 010298 ** purposes (for example, to store a high-level schema associated with 010299 ** the shared-btree). The btree layer manages reference counting issues. 
010300 ** 010301 ** The first time this is called on a shared-btree, nBytes bytes of memory 010302 ** are allocated, zeroed, and returned to the caller. For each subsequent 010303 ** call the nBytes parameter is ignored and a pointer to the same blob 010304 ** of memory returned. 010305 ** 010306 ** If the nBytes parameter is 0 and the blob of memory has not yet been 010307 ** allocated, a null pointer is returned. If the blob has already been 010308 ** allocated, it is returned as normal. 010309 ** 010310 ** Just before the shared-btree is closed, the function passed as the 010311 ** xFree argument when the memory allocation was made is invoked on the 010312 ** blob of allocated memory. The xFree function should not call sqlite3_free() 010313 ** on the memory, the btree layer does that. 010314 */ 010315 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 010316 BtShared *pBt = p->pBt; 010317 sqlite3BtreeEnter(p); 010318 if( !pBt->pSchema && nBytes ){ 010319 pBt->pSchema = sqlite3DbMallocZero(0, nBytes); 010320 pBt->xFreeSchema = xFree; 010321 } 010322 sqlite3BtreeLeave(p); 010323 return pBt->pSchema; 010324 } 010325 010326 /* 010327 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 010328 ** btree as the argument handle holds an exclusive lock on the 010329 ** sqlite_master table. Otherwise SQLITE_OK. 010330 */ 010331 int sqlite3BtreeSchemaLocked(Btree *p){ 010332 int rc; 010333 assert( sqlite3_mutex_held(p->db->mutex) ); 010334 sqlite3BtreeEnter(p); 010335 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); 010336 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); 010337 sqlite3BtreeLeave(p); 010338 return rc; 010339 } 010340 010341 010342 #ifndef SQLITE_OMIT_SHARED_CACHE 010343 /* 010344 ** Obtain a lock on the table whose root page is iTab. The 010345 ** lock is a write lock if isWritelock is true or a read lock 010346 ** if it is false. 
010347 */ 010348 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 010349 int rc = SQLITE_OK; 010350 assert( p->inTrans!=TRANS_NONE ); 010351 if( p->sharable ){ 010352 u8 lockType = READ_LOCK + isWriteLock; 010353 assert( READ_LOCK+1==WRITE_LOCK ); 010354 assert( isWriteLock==0 || isWriteLock==1 ); 010355 010356 sqlite3BtreeEnter(p); 010357 rc = querySharedCacheTableLock(p, iTab, lockType); 010358 if( rc==SQLITE_OK ){ 010359 rc = setSharedCacheTableLock(p, iTab, lockType); 010360 } 010361 sqlite3BtreeLeave(p); 010362 } 010363 return rc; 010364 } 010365 #endif 010366 010367 #ifndef SQLITE_OMIT_INCRBLOB 010368 /* 010369 ** Argument pCsr must be a cursor opened for writing on an 010370 ** INTKEY table currently pointing at a valid table entry. 010371 ** This function modifies the data stored as part of that entry. 010372 ** 010373 ** Only the data content may only be modified, it is not possible to 010374 ** change the length of the data stored. If this function is called with 010375 ** parameters that attempt to write past the end of the existing data, 010376 ** no modifications are made and SQLITE_CORRUPT is returned. 010377 */ 010378 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 010379 int rc; 010380 assert( cursorOwnsBtShared(pCsr) ); 010381 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 010382 assert( pCsr->curFlags & BTCF_Incrblob ); 010383 010384 rc = restoreCursorPosition(pCsr); 010385 if( rc!=SQLITE_OK ){ 010386 return rc; 010387 } 010388 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 010389 if( pCsr->eState!=CURSOR_VALID ){ 010390 return SQLITE_ABORT; 010391 } 010392 010393 /* Save the positions of all other cursors open on this table. This is 010394 ** required in case any of them are holding references to an xFetch 010395 ** version of the b-tree page modified by the accessPayload call below. 
010396 ** 010397 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition() 010398 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence 010399 ** saveAllCursors can only return SQLITE_OK. 010400 */ 010401 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr); 010402 assert( rc==SQLITE_OK ); 010403 010404 /* Check some assumptions: 010405 ** (a) the cursor is open for writing, 010406 ** (b) there is a read/write transaction open, 010407 ** (c) the connection holds a write-lock on the table (if required), 010408 ** (d) there are no conflicting read-locks, and 010409 ** (e) the cursor points at a valid row of an intKey table. 010410 */ 010411 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){ 010412 return SQLITE_READONLY; 010413 } 010414 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0 010415 && pCsr->pBt->inTransaction==TRANS_WRITE ); 010416 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); 010417 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); 010418 assert( pCsr->pPage->intKey ); 010419 010420 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); 010421 } 010422 010423 /* 010424 ** Mark this cursor as an incremental blob cursor. 010425 */ 010426 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){ 010427 pCur->curFlags |= BTCF_Incrblob; 010428 pCur->pBtree->hasIncrblobCur = 1; 010429 } 010430 #endif 010431 010432 /* 010433 ** Set both the "read version" (single byte at byte offset 18) and 010434 ** "write version" (single byte at byte offset 19) fields in the database 010435 ** header to iVersion. 010436 */ 010437 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ 010438 BtShared *pBt = pBtree->pBt; 010439 int rc; /* Return code */ 010440 010441 assert( iVersion==1 || iVersion==2 ); 010442 010443 /* If setting the version fields to 1, do not automatically open the 010444 ** WAL connection, even if the version fields are currently set to 2. 
010445 */ 010446 pBt->btsFlags &= ~BTS_NO_WAL; 010447 if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL; 010448 010449 rc = sqlite3BtreeBeginTrans(pBtree, 0, 0); 010450 if( rc==SQLITE_OK ){ 010451 u8 *aData = pBt->pPage1->aData; 010452 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){ 010453 rc = sqlite3BtreeBeginTrans(pBtree, 2, 0); 010454 if( rc==SQLITE_OK ){ 010455 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 010456 if( rc==SQLITE_OK ){ 010457 aData[18] = (u8)iVersion; 010458 aData[19] = (u8)iVersion; 010459 } 010460 } 010461 } 010462 } 010463 010464 pBt->btsFlags &= ~BTS_NO_WAL; 010465 return rc; 010466 } 010467 010468 /* 010469 ** Return true if the cursor has a hint specified. This routine is 010470 ** only used from within assert() statements 010471 */ 010472 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){ 010473 return (pCsr->hints & mask)!=0; 010474 } 010475 010476 /* 010477 ** Return true if the given Btree is read-only. 010478 */ 010479 int sqlite3BtreeIsReadonly(Btree *p){ 010480 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0; 010481 } 010482 010483 /* 010484 ** Return the size of the header added to each page by this module. 010485 */ 010486 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); } 010487 010488 #if !defined(SQLITE_OMIT_SHARED_CACHE) 010489 /* 010490 ** Return true if the Btree passed as the only argument is sharable. 010491 */ 010492 int sqlite3BtreeSharable(Btree *p){ 010493 return p->sharable; 010494 } 010495 010496 /* 010497 ** Return the number of connections to the BtShared object accessed by 010498 ** the Btree handle passed as the only argument. For private caches 010499 ** this is always 1. For shared caches it may be 1 or greater. 010500 */ 010501 int sqlite3BtreeConnectionCount(Btree *p){ 010502 testcase( p->sharable ); 010503 return p->pBt->nRef; 010504 } 010505 #endif