File: ContentIdentifier.hs

package info (click to toggle)
git-annex 10.20250416-2
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 73,572 kB
  • sloc: haskell: 90,656; javascript: 9,103; sh: 1,469; makefile: 211; perl: 137; ansic: 44
file content (190 lines) | stat: -rw-r--r-- 6,674 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
{- Sqlite database of ContentIdentifiers imported from special remotes.
 -
 - Copyright 2019-2023 Joey Hess <id@joeyh.name>
 -:
 - Licensed under the GNU AGPL version 3 or higher.
 -}

{-# LANGUAGE CPP #-}
{-# LANGUAGE QuasiQuotes, TypeFamilies, TypeOperators, TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts, EmptyDataDecls #-}
{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE DataKinds, FlexibleInstances #-}
{-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE StandaloneDeriving #-}

module Database.ContentIdentifier (
	ContentIdentifierHandle,
	databaseIsEmpty,
	openDb,
	closeDb,
	flushDbQueue,
	recordContentIdentifier,
	getContentIdentifiers,
	getContentIdentifierKeys,
	recordAnnexBranchTree,
	getAnnexBranchTree,
	needsUpdateFromLog,
	updateFromLog,
	ContentIdentifiersId,
	AnnexBranchId,
) where

import Database.Types
import qualified Database.Queue as H
import Database.Init
import Database.Utility
import Annex.Locations
import Annex.Common hiding (delete)
import qualified Annex.Branch
import Types.Import
import Types.RemoteState
import Git.Types
import Git.Sha
import Git.FilePath
import qualified Git.DiffTree as DiffTree
import Logs
import qualified Logs.ContentIdentifier as Log

import Database.Persist.Sql hiding (Key)
import Database.Persist.TH

#if MIN_VERSION_persistent_sqlite(2,13,3)
import Database.RawFilePath
#else
import Database.Persist.Sqlite (runSqlite)
import qualified Data.Text as T
#endif

data ContentIdentifierHandle = ContentIdentifierHandle H.DbQueue Bool

databaseIsEmpty :: ContentIdentifierHandle -> Bool
databaseIsEmpty (ContentIdentifierHandle _ b) = b

-- Note on indexes: ContentIndentifiersKeyRemoteCidIndex etc are really
-- uniqueness constraints, which cause sqlite to automatically add indexes.
-- So when adding indexes, have to take care to only add ones that work as
-- uniqueness constraints. (Unfortunately persistent does not support indexes
-- that are not uniqueness constraints; 
-- https://github.com/yesodweb/persistent/issues/109)
-- 
-- ContentIndentifiersKeyRemoteCidIndex speeds up queries like 
-- getContentIdentifiers, but it is not used for
-- getContentIdentifierKeys. ContentIndentifiersCidRemoteKeyIndex was
-- addedto speed that up.
share [mkPersist sqlSettings, mkMigrate "migrateContentIdentifier"] [persistLowerCase|
ContentIdentifiers
  remote UUID
  cid ContentIdentifier
  key Key
  ContentIndentifiersKeyRemoteCidIndex key remote cid
  ContentIndentifiersCidRemoteKeyIndex cid remote key
-- The last git-annex branch tree sha that was used to update
-- ContentIdentifiers
AnnexBranch
  tree SSha
  UniqueTree tree
|]

{- Opens the database, creating it if it doesn't exist yet.
 -
 - Only a single process should write to the database at a time, so guard
 - any writes with the gitAnnexContentIdentifierLock.
 -}
openDb :: Annex ContentIdentifierHandle
openDb = do
	dbdir <- calcRepo' gitAnnexContentIdentifierDbDir
	let db = dbdir </> literalOsPath "db"
	isnew <- liftIO $ not <$> doesFileExist db
	if isnew
		then initDb db $ void $ 
			runMigrationSilent migrateContentIdentifier
		-- Migrate from old versions of database, which had buggy
		-- and suboptimal uniqueness constraints.
#if MIN_VERSION_persistent_sqlite(2,13,3)
		else liftIO $ runSqlite' (fromOsPath db) $ void $
			runMigrationSilent migrateContentIdentifier
#else
		else liftIO $ runSqlite (T.pack (fromRawFilePath db)) $ void $
			runMigrationSilent migrateContentIdentifier
#endif
	h <- liftIO $ H.openDbQueue db "content_identifiers"
	return $ ContentIdentifierHandle h isnew

closeDb :: ContentIdentifierHandle -> Annex ()
closeDb (ContentIdentifierHandle h _) = liftIO $ H.closeDbQueue h

queueDb :: ContentIdentifierHandle -> SqlPersistM () -> IO ()
queueDb (ContentIdentifierHandle h _) = H.queueDb h checkcommit
  where
	-- commit queue after 1000 changes
	checkcommit sz _lastcommittime
		| sz > 1000 = return True
		| otherwise = return False

flushDbQueue :: ContentIdentifierHandle -> IO ()
flushDbQueue (ContentIdentifierHandle h _) = H.flushDbQueue h

-- Be sure to also update the git-annex branch when using this.
recordContentIdentifier :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> Key -> IO ()
recordContentIdentifier h (RemoteStateHandle u) cid k = queueDb h $ do
	void $ insertUniqueFast $ ContentIdentifiers u cid k

getContentIdentifiers :: ContentIdentifierHandle -> RemoteStateHandle -> Key -> IO [ContentIdentifier]
getContentIdentifiers (ContentIdentifierHandle h _) (RemoteStateHandle u) k = 
	H.queryDbQueue h $ do
		l <- selectList
			[ ContentIdentifiersKey ==. k
			, ContentIdentifiersRemote ==. u
			] []
		return $ map (contentIdentifiersCid . entityVal) l

getContentIdentifierKeys :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> IO [Key]
getContentIdentifierKeys (ContentIdentifierHandle h _) (RemoteStateHandle u) cid = 
	H.queryDbQueue h $ do
		l <- selectList
			[ ContentIdentifiersCid ==. cid
			, ContentIdentifiersRemote ==. u
			] []
		return $ map (contentIdentifiersKey . entityVal) l

recordAnnexBranchTree :: ContentIdentifierHandle -> Sha -> IO ()
recordAnnexBranchTree h s = queueDb h $ do
	deleteWhere ([] :: [Filter AnnexBranch])
	void $ insertUniqueFast $ AnnexBranch $ toSSha s

getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha
getAnnexBranchTree (ContentIdentifierHandle h _) = H.queryDbQueue h $ do
	l <- selectList ([] :: [Filter AnnexBranch]) []
	case l of
		(s:[]) -> return $ fromSSha $ annexBranchTree $ entityVal s
		_ -> return emptyTree

{- Check if the git-annex branch has been updated and the database needs
 - to be updated with any new content identifiers in it. -}
needsUpdateFromLog :: ContentIdentifierHandle -> Annex (Maybe (Sha, Sha))
needsUpdateFromLog db = do
	oldtree <- liftIO $ getAnnexBranchTree db
	Annex.Branch.updatedFromTree oldtree

{- The database should be locked for write when calling this. -}
updateFromLog :: ContentIdentifierHandle -> (Sha, Sha) -> Annex ContentIdentifierHandle
updateFromLog db@(ContentIdentifierHandle h _) (oldtree, currtree) = do
	(l, cleanup) <- inRepo $
		DiffTree.diffTreeRecursive oldtree currtree
	mapM_ go l
	void $ liftIO $ cleanup
	liftIO $ do
		recordAnnexBranchTree db currtree
		flushDbQueue db
	return (ContentIdentifierHandle h False)
  where
	go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of
		Nothing -> return ()
		Just k -> do
			l <- Log.getContentIdentifiers k
			liftIO $ forM_ l $ \(rs, cids) ->
				forM_ cids $ \cid ->
					recordContentIdentifier db rs cid k