This repository has been archived by the owner on Jan 30, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 20
/
castorSvcClassSiteMover.py
340 lines (292 loc) · 15.1 KB
/
castorSvcClassSiteMover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
import os
import commands
import re
import SiteMover
from futil import *
from PilotErrors import PilotErrors
from pUtil import tolog, readpar, verifySetupCommand
from time import time
from FileStateClient import updateFileState
from timed_command import timed_command
class castorSvcClassSiteMover(SiteMover.SiteMover):
    """
    SiteMover for CASTOR, which finds the correct service class from which to stage in
    files via rfcp.

    Only stage-in (get_data) is implemented; put_data always reports a failure.
    """

    # Mover identification and capability flags. They are only declared here;
    # how they are consumed is decided by code outside this class.
    copyCommand = "rfcpsvcclass"
    checksum_command = "adler32"
    has_mkdir = True
    has_df = False
    has_getsize = True
    has_md5sum = False
    has_chmod = True
    timeout = 5*3600  # value returned by get_timeout(); presumably seconds (5 hours)

    def __init__(self, setup_path='', *args, **kwrds):
        """Remember the optional setup script path.

        Any additional positional or keyword arguments are accepted and ignored.
        """
        self._setup = setup_path

    def get_timeout(self):
        """Return the class-level timeout value."""
        return self.timeout

    def _check_space(self, ub):
        """CASTOR specific space verification.
        There is no simple way at the moment to verify CASTOR space availability - check info system instead

        The argument is ignored and a fixed large value (999999) is returned.
        """
        return 999999

    def _remove_local_copy(self, dest_path):
        """Remove the local copy at dest_path so a later get retry starts clean.

        A failed removal is only logged; it never aborts the caller.
        """
        if not self.removeLocal(dest_path):
            tolog("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ The local file is assumed to have a relative path that is the same of the relative path in the 'gpfn'
        loc_... are the variables used to access the file in the locally exported file system
        TODO: document GPFN format (SURL from catalog srm://host/path)
        TODO: document better constraint

        Stage in a single file with rfcp and validate the resulting local copy.

        Parameters:
          gpfn      -- SURL of the file; either contains 'SFN=<castor path>' or
                       the castor path is taken as everything after the third '/'.
          lfn       -- logical file name, also used as the local file name.
          path      -- local directory that receives the file.
          fsize     -- expected file size; when 0 it is read via os.path.getsize(loc_pfn).
          fchecksum -- expected checksum; 0 or "" disables the checksum comparison.
          guid      -- file GUID, only forwarded to the tracing report.
          pdict     -- must contain 'report'; optionally 'usect', 'jobId',
                       'workDir' and 'access'.

        Returns a tuple (exit code, pilotErrorDiag); the exit code is 0 on
        success and a PilotErrors code (or the code returned by a helper)
        otherwise. error.ERR_DIRECTIOFILE is returned when the file is left
        for direct access instead of being copied.

        Side effects: sets the CASTOR related variables (including
        STAGE_SVCCLASS) in os.environ.
        """
        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        useCT = pdict.get('usect', True)
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        prodDBlockToken = pdict.get('access', '')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'castorSVC', lfn, guid)

        # get a proper envsetup
        envsetup = self.getEnvsetup(get=True)

        # Hard code the configuration dictionary for now, but eventually this should be
        # set dynamically.
        #
        # There are the following configuration sections:
        #  setup - base environment variables to be set
        #  svcClassMap - dictionary of string matches vs. service class names
        #  svcClassList - list of all service classes in case the svcClassMap matching fails
        #  svcClassDefault - the service class to set if the file appears to be staged no where
        #
        # Information from RAL:
        # [root@srm0661 ~]# listStorageArea -v atlas
        # <Space Token> <Description> <service class> <type> <status>
        # 4948ef55-0000-1000-b7dd-9b38bdd87201 "ATLASGROUP" "atlasStripDeg" "DURABLE" "ALLOCATED"
        # 4948ef38-0000-1000-8606-973e4e998e02 "ATLASMCDISK" "atlasSimStrip" "DURABLE" "ALLOCATED"
        # 4948eec6-0000-1000-8ca2-aba0529b4806 "ATLASDATADISK" "atlasStripInput" "DURABLE" "ALLOCATED"
        # 4948ee8e-0000-1000-9ac5-81bb9b34ba7b "ATLASMCTAPE" "atlasSimRaw" "PERMANENT" "ALLOCATED"
        # 4948ee71-0000-1000-b611-a0afad31f6c8 "ATLASDATATAPE" "atlasT0Raw" "PERMANENT" "ALLOCATED"
        # "ATLASHOTDISK" "atlasHotDisk"
        # In addition there is the "atlasFarm" class, which is used when data is staged back from tape
        castorConfig = {
            'setup' : {
                'STAGE_HOST' : 'catlasstager.ads.rl.ac.uk',
                'STAGER_HOST' : 'catlasstager.ads.rl.ac.uk',
                'RFIO_USE_CASTOR_V2' : 'YES',
                },
            'svcClassList' : ('atlasHotDisk', 'atlasSimStrip', 'atlasStripInput', 'atlasFarm', 'atlasStripDeg', 'atlasT0Raw', 'atlasSimRaw', 'atlasScratchDisk', ),
            'svcClassMap' : {
                '/atlashotdisk/' : 'atlasHotDisk',
                '/atlasmcdisk/' : 'atlasStripInput',
                '/atlasdatadisk/' : 'atlasStripInput',
                '/atlasgroupdisk/' : 'atlasStripDeg',
                '/atlasdatatape/' : 'atlasFarm',
                '/atlasmctape/' : 'atlasFarm',
                '/atlasscratchdisk/' : 'atlasScratchDisk',
                '/atlasProdDisk/' : 'atlasScratchDisk',
                },
            'svcClassDefault' : 'atlasFarm',
            }

        # Set all environment variables for castor setup
        for envVar, value in castorConfig['setup'].items():
            os.environ[envVar] = value

        # Strip the gpfn (SURL) back to its bare castor component
        tolog("gpfn is %s" % gpfn)

        if self._setup:
            _setup_str = "source %s; " % self._setup
        else:
            _setup_str = envsetup

        ec, pilotErrorDiag = verifySetupCommand(error, _setup_str)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return ec, pilotErrorDiag

        loc_pfn = ''
        # Test for the exact token that is split on, so that a SURL which merely
        # contains "SFN" (without "=") falls through to slash splitting instead
        # of raising IndexError.
        if 'SFN=' in gpfn:
            loc_pfn = gpfn.split('SFN=')[1]
            tolog("Found SFN string. Local file name %s" % loc_pfn)
        else:
            _tmp = gpfn.split('/', 3)
            loc_pfn = '/'+_tmp[3]
            tolog("Splitting SURL on slashes. Got local file name %s" % loc_pfn)

        if not loc_pfn.startswith('/castor/'):
            tolog("WARNING: Problem with local filename: Does not start with '/castor/'.")

        # should the root file be copied or read directly by athena?
        directIn, useFileStager = self.getTransferModes()
        if directIn:
            if useCT:
                directIn = False
                tolog("Direct access mode is switched off (file will be transferred with the copy tool)")
                updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", ftype="input")
            else:
                # determine if the file is a root file according to its name
                rootFile = self.isRootFileName(lfn)

                if prodDBlockToken == 'local' or not rootFile:
                    directIn = False
                    tolog("Direct access mode has been switched off for this file (will be transferred with the copy tool)")
                    updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", ftype="input")
                elif rootFile:
                    tolog("Found root file according to file name: %s (will not be transferred in direct reading mode)" % (lfn))
                    report['relativeStart'] = None
                    report['transferStart'] = None
                    self.prepareReport('FOUND_ROOT', report)
                    if useFileStager:
                        updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="file_stager", ftype="input")
                    else:
                        updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="remote_io", ftype="input")
                    # the file is not copied; the caller is told via this code
                    return error.ERR_DIRECTIOFILE, pilotErrorDiag
        else:
            # Paired with "if directIn": the root-file test above is already
            # exhaustive, so this is the only placement where the branch can run.
            tolog("Normal file transfer")

        # Now need to find the service class associated with the file.
        # If we find a clear indication of a space token in the file path
        # then this is easy. However, if we don't, then use stager_qry to
        # interrogate each possible service class. If this fails then use
        # atlasFarm in desperation.
        serviceClass = None
        for pathMatch, svcClass in castorConfig['svcClassMap'].items():
            if loc_pfn.find(pathMatch) >= 0:
                tolog('Matched path element %s - service class is %s' % (pathMatch, svcClass))
                serviceClass = svcClass
                break
            else:
                tolog('Path element %s for service class %s - no match' % (pathMatch, svcClass))

        # For testing the fallback, then we need to hobble ourselves by unsetting serviceClass:
        #tolog('Automatic service class was: %s' % serviceClass)
        #tolog('Unsetting service class for fallback testing')
        #serviceClass = None
        if serviceClass is None:
            tolog("Warning: Failed to find service class hint in SURL.")
            for tryMe in castorConfig['svcClassList']:
                # stager_qry reads the service class from the environment
                os.environ['STAGE_SVCCLASS'] = tryMe
                tolog('Trying service class %s for file' % tryMe)
                # NOTE(review): loc_pfn is interpolated unquoted into a shell command
                err, output = commands.getstatusoutput('stager_qry -M %s' % loc_pfn)
                if err != 0:
                    tolog('WARNING: Unexpected status from stager_qry: %d\n%s' % (err, output))
                elif output.find('STAGED') >= 0:
                    tolog('Found file in service class %s' % tryMe)
                    serviceClass = tryMe
                    break
                else:
                    tolog('File not found in service class %s' % tryMe)

            if serviceClass is None:
                tolog('WARNING: Failed to find file in any expected service class - will set STAGE_SVCCLASS to %s' % castorConfig['svcClassDefault'])
                serviceClass = castorConfig['svcClassDefault']

        tolog('Setting STAGE_SVCCLASS to %s' % serviceClass)
        os.environ['STAGE_SVCCLASS'] = serviceClass

        dest_path = os.path.join(path, lfn)
        _cmd_str = '%s/usr/bin/rfcp %s %s' % (_setup_str, loc_pfn, dest_path)
        tolog("Executing command: %s" % (_cmd_str))
        report['transferStart'] = time()

        # execute
        timeout = 3600  # limit handed to timed_command() for this single copy
        try:
            s, telapsed, cout, cerr = timed_command(_cmd_str, timeout)
        except Exception as e:
            pilotErrorDiag = 'timed_command() threw an exception: %s' % (e)
            tolog("!!WARNING!!1111!! %s" % (pilotErrorDiag))
            s = 1
            o = str(e)
            telapsed = timeout
        else:
            # improve output parsing, keep stderr and stdout separate
            o = cout + cerr

        tolog("Elapsed time: %d" % (telapsed))
        report['validateStart'] = time()

        if s != 0:
            o = o.replace('\n', ' ')
            pilotErrorDiag = "rfcp failed: %d, %s" % (s, o)
            tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
            check_syserr(s, o)

            # remove the local file before any get retry is attempted
            self._remove_local_copy(dest_path)

            ec = error.ERR_STAGEINFAILED
            if o.find("No such file or directory") >= 0:
                if loc_pfn.find("DBRelease") >= 0:
                    pilotErrorDiag = "Missing DBRelease file: %s" % (loc_pfn)
                    tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
                    ec = error.ERR_MISSDBREL
                else:
                    pilotErrorDiag = "No such file or directory: %s" % (loc_pfn)
                    tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
                    ec = error.ERR_NOSUCHFILE
                self.prepareReport('RFCP_FAIL', report)
            elif is_timeout(s):
                pilotErrorDiag = "rfcp get was timed out after %d seconds" % (telapsed)
                tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                self.prepareReport('GET_TIMEOUT', report)
                ec = error.ERR_GETTIMEOUT
            else:
                # Any other failure: finalise the tracing report as well, so
                # that no failure exit leaves the report unprepared.
                self.prepareReport('RFCP_FAIL', report)
            return ec, pilotErrorDiag

        tolog("Copy command finished")

        if fsize == 0:
            try:
                # NOTE(review): this stats the castor path (loc_pfn), not the
                # copy in dest_path - confirm the path is locally visible.
                fsize = str(os.path.getsize(loc_pfn))
            except OSError as e:
                pilotErrorDiag = "Could not get file size: %s" % str(e)
                tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
                self.prepareReport('FS_FAIL', report)

                # remove the local file before any get retry is attempted
                self._remove_local_copy(dest_path)
                return error.ERR_FAILEDSIZELOCAL, pilotErrorDiag

        loc_filename = lfn
        dest_file = os.path.join(path, loc_filename)

        # A checksum counts as supplied only when it is neither 0 nor empty;
        # the same test drives both the checksum type and the comparison below.
        have_checksum = fchecksum != 0 and fchecksum != ""

        # get the checksum type (md5sum or adler32)
        if have_checksum:
            csumtype = self.getChecksumType(fchecksum)
        else:
            csumtype = "default"

        # get size and checksum of the file that was just copied
        ec, pilotErrorDiag, dstfsize, dstfchecksum = self.getLocalFileInfo(dest_file, csumtype=csumtype)
        if ec != 0:
            self.prepareReport('LOCAL_FILE_INFO_FAIL', report)

            # remove the local file before any get retry is attempted
            self._remove_local_copy(dest_path)
            return ec, pilotErrorDiag

        # compare remote and local file size
        # NOTE(review): values are compared as given; presumably both are strings
        if dstfsize != fsize:
            pilotErrorDiag = "Remote and local file sizes do not match for %s (%s != %s)" %\
                             (os.path.basename(gpfn), str(dstfsize), str(fsize))
            tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
            self.prepareReport('FS_MISMATCH', report)

            # remove the local file before any get retry is attempted
            self._remove_local_copy(dest_path)
            return error.ERR_GETWRONGSIZE, pilotErrorDiag

        # compare remote and local file checksum
        if have_checksum and dstfchecksum != fchecksum and not self.isDummyChecksum(fchecksum):
            pilotErrorDiag = "Remote and local checksums (of type %s) do not match for %s (%s != %s)" %\
                             (csumtype, os.path.basename(gpfn), dstfchecksum, fchecksum)
            tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))

            # remove the local file before any get retry is attempted
            self._remove_local_copy(dest_path)

            if csumtype == "adler32":
                self.prepareReport('AD_MISMATCH', report)
                return error.ERR_GETADMISMATCH, pilotErrorDiag
            else:
                self.prepareReport('MD5_MISMATCH', report)
                return error.ERR_GETMD5MISMATCH, pilotErrorDiag

        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", ftype="input")
        self.prepareReport('DONE', report)
        return 0, pilotErrorDiag

    def put_data(self, source, ddm_storage, fsize=0, fchecksum=0, dsname='', **pdict):
        """ Data transfer using rfcp - generic version
        It's not advisable to use this right now because there's no
        easy way to register the srm space token if the file is
        copied with rfcp

        Stage-out is not implemented by this mover: the call logs a warning,
        finalises the tracing report with 'NOT_IMPL' and returns the result of
        put_data_retfail() for error.ERR_STAGEOUTFAILED. pdict must contain
        'report'; 'lfn' and 'guid' are optional and only used for the report.
        """
        error = PilotErrors()

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'castorSVC', lfn, guid)

        pilotErrorDiag = "put_data does not work for this mover"
        tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
        self.prepareReport('NOT_IMPL', report)
        return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)