#!/usr/bin/env python
import sys, os, string, re
import urllib
###########################################################################
def unserialize(data):
    """
    Unserialize a PHP-serialized string and return the decoded value.

    Parsing starts at the first character of 'data'.  intern_unserialize
    returns a (type, chars_consumed, value) tuple; only the value is
    handed back to the caller.
    """
    (_dtype, _consumed, value) = intern_unserialize(data, 0)
    return value
###########################################################################
def intern_unserialize(data, offset=0):
    """
    Find the next token and unserialize it.
    Recurse on arrays and objects.

    data   = the complete PHP-serialized string
    offset = raw offset from start of data where the token begins

    Returns a tuple (dtype, chars_consumed, value):
      dtype          lower-cased type letter of the token
      chars_consumed number of characters the token occupies in data
      value          int ('i'), bool ('b'), float ('d'), None ('n'),
                     str ('s') or dict ('a' and 'o')

    Arrays are always returned as a dict: PHP has no tuples or lists, so
    the original container type is lost when serializing.  Objects are
    returned as a dict of their properties; the class name is discarded.

    Raises Exception on an unknown type letter, on a missing terminator
    (raised by read_until) and on a string shorter than its declared
    length.
    """
    dtype = data[offset:offset + 1].lower()
    # Type letter plus the separator that follows it ('t:' or 'N;') = 2 chars
    dataoffset = offset + 2

    # int / bool / double share the layout 't:<digits>;'
    if dtype in ('i', 'b', 'd'):
        (chars, raw) = read_until(data, dataoffset, ';')
        # +1 for end semicolon
        dataoffset += chars + 1
        if dtype == 'i':
            readdata = int(raw)
        elif dtype == 'b':
            readdata = (int(raw) == 1)
        else:
            readdata = float(raw)
    # n => None ('N;' is fully covered by the 2 chars skipped above)
    elif dtype == 'n':
        readdata = None
    # s => String, layout 's:<length>:"<chars>";'
    elif dtype == 's':
        (chars, stringlength) = read_until(data, dataoffset, ':')
        length = int(stringlength)
        # +2 for the colon after the length field and the start quote
        dataoffset += chars + 2
        # read_chars starts one position before the offset it is given,
        # so pass +1 to begin exactly at the first character of the string
        (chars, readdata) = read_chars(data, dataoffset + 1, length)
        # +2 for endquote semicolon
        dataoffset += chars + 2
        # read_chars always reports 'length' chunks, even past the end of
        # data, so the real check is on what was actually read
        if len(readdata) != length:
            raise Exception("String length mismatch")
    # array => Dict, layout 'a:<count>:{<key><value>...}'
    elif dtype == 'a':
        # How many keys does this array have?
        (chars, keys) = read_until(data, dataoffset, ':')
        # +2 for the colon after the count and the opening brace
        dataoffset += chars + 2
        (dataoffset, readdata) = _intern_unserialize_pairs(
            data, dataoffset, int(keys))
        # +1 for the closing brace
        dataoffset += 1
    # object => Dict, layout 'O:<namelen>:"<class>":<count>:{<key><value>...}'
    elif dtype == 'o':
        # Skip the class-name length and the class name itself
        (chars, _namelength) = read_until(data, dataoffset, ':')
        dataoffset += chars + 1
        (chars, _classname) = read_until(data, dataoffset, ':')
        dataoffset += chars + 1
        (chars, keys) = read_until(data, dataoffset, ':')
        # +2 for the colon after the count and the opening brace
        dataoffset += chars + 2
        (dataoffset, readdata) = _intern_unserialize_pairs(
            data, dataoffset, int(keys))
        # +1 for the closing brace (same as for arrays)
        dataoffset += 1
    # I don't know how to unserialize this
    else:
        raise Exception("Unknown / Unhandled data type (%s)!" % dtype)
    return (dtype, dataoffset - offset, readdata)


def _intern_unserialize_pairs(data, dataoffset, count):
    """
    Read 'count' consecutive key/value token pairs starting at
    data[dataoffset].  Returns (offset_after_last_pair, dict_of_pairs).
    """
    pairs = {}
    for _ in range(count):
        # Read the key
        (_ktype, kchars, key) = intern_unserialize(data, dataoffset)
        dataoffset += kchars
        # Read value of the key
        (_vtype, vchars, value) = intern_unserialize(data, dataoffset)
        dataoffset += vchars
        pairs[key] = value
    return (dataoffset, pairs)
###########################################################################
def read_until(data, offset, stopchar):
    """
    Collect characters from data[offset] up to, but not including, the
    first occurrence of 'stopchar'.

    Returns (chars_read, text).  Raises Exception("Invalid") when the end
    of data is reached before 'stopchar' is seen.
    """
    position = offset
    collected = []
    while True:
        current = data[position:position + 1]
        if current == stopchar:
            break
        # No character left after this one, so stopchar cannot follow
        if position + 2 > len(data):
            raise Exception("Invalid")
        collected.append(current)
        position += 1
    # (chars_read, data)
    return (len(collected), "".join(collected))
###########################################################################
def read_chars(data, offset, length):
    """
    Read 'length' one-character slices of data, the first one being
    data[offset-1:offset] (i.e. reading starts one position BEFORE
    'offset'; callers compensate by passing the wanted start + 1).

    Returns (chars_read, text).  chars_read is always 'length': slices
    that fall outside data contribute an empty string to the text but are
    still counted.
    """
    pieces = [data[offset + step - 1:offset + step] for step in range(length)]
    # (chars_read, data)
    return (len(pieces), "".join(pieces))
###########################################################################
def parseOptions(argv):
    """
    Parses command-line options.
    Returns a dictionary with specified options as keys (the leading
    dash is kept in the key):
    -opt1 --> '-opt1' : None
    -opt2 val --> '-opt2' : 'val'
    -opt3=val --> '-opt3' : 'val'
    Arguments that do not start with '-' and are not consumed as the
    value of a preceding option are ignored.  An option given more than
    once keeps its last value.
    Usually called as
    options = parseOptions(sys.argv[1:])
    """
    options = {}
    argc = len(argv)
    i = 0
    while i < argc:
        arg = argv[i]
        i += 1
        # startswith() is safe on an empty-string argument, argv[i][0] is not
        if not arg.startswith('-'):
            continue
        eq = arg.find('=')
        if eq > 0:
            options[arg[:eq]] = arg[eq + 1:]
            continue
        val = None
        # A following argument that is not itself an option is the value
        if i < argc and not argv[i].startswith('-'):
            val = argv[i]
            i += 1
        options[arg] = val
    return options
###########################################################################
def GetRefDBInfo(owner,dataset):
    """
    Ask the RefDB CollectionTree script for the collections of an
    owner/dataset pair and return them as a list of
    [id, name, type, oname, dname] lists, one per collection.

    An error message is printed and the exception is (re-)raised when the
    URL cannot be read, when RefDB answers with an HTML page instead of
    serialized data, when the answer cannot be unserialized, or when a
    collection record lacks one of the expected fields.

    NOTE(review): the result order follows the iteration order of the
    dict returned by unserialize(); callers treat the first entry as the
    primary collection - confirm that order is reliable.
    """
    RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
    RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'
    try:
        f = urllib.urlopen(RefDBurl_+RefDBMotherphp_+'?format=serialized&owner='+owner+'&dataset='+dataset)
        try:
            data = f.read()
        finally:
            f.close()
    except IOError:
        print('\nERROR accessing RefDB for Owner/Dataset: '+owner+'/'+dataset+'\n')
        # Nothing was read: let the caller decide how to stop
        raise
    # An answer starting with '<' is an HTML page, not serialized data
    if data[:1] == '<':
        if data.find("down") > -1:
            print("\n WARNING: RefDB is temporarily down for a short maintenace \n")
            print('\nERROR accessing RefDB for Owner/Dataset: '+owner+'/'+dataset+'\n')
        else:
            print('\nERROR No Collection Owner/Dataset found in RefDB : '+owner+'/'+dataset+'\n')
        raise Exception("RefDB returned an HTML page instead of serialized data")
    try:
        collections = unserialize(data)
    except Exception:
        print('\nERROR Unserializing: '+data+'\n')
        raise
    collinfos = []
    try:
        for k in collections.keys():
            entry = collections[k]
            collinfos.append([entry['id'], entry['name'], entry['type'],
                              entry['oname'], entry['dname']])
    except (AttributeError, KeyError, TypeError):
        print(' '.join(['ERROR accessing PHP: ', str(data), 'isn\'t updated version \n']))
        raise
    return collinfos
###########################################################################
def findPubDBbyCollID(CollID):
    """
    Find PubDB URLs having a given Collection

    Queries the central RefDB-PubDBs map for 'CollID' (a string) and
    returns the list of distinct values found after 'PubDBURL=' in the
    answer.  Each value is cut at its first '=' sign, if it has one.
    When the map cannot be contacted an error is printed and an empty
    list is returned.
    """
    ### contact the RefDB-PubDBs map to discovery where the given CollID is
    PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
    RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.php?display=1'
    url = PubDBCentralUrl_+RefDBPubDBsmapPhp_+'&CollID=' + CollID
    try:
        f = urllib.urlopen(url)
    except IOError:
        print('\nERROR accessing RefDB-PubDBs map at '+url+'\n')
        # Without an answer there is nothing to scan
        return []
    ### search for the PubDBURL string
    reURLLine = re.compile(r'PubDBURL=(\S*)')
    PubDBURLs = []
    try:
        for line in f.readlines():
            found = reURLLine.search(line)
            if found:
                # 'PubDBURL=<value>' split on '=': element 1 is the value
                # up to its own first '=' (if any)
                PubDBURLs.append(found.group().split('=')[1])
    finally:
        f.close()
    ### return PubDBURLs where the collection is present
    return uniquelist(PubDBURLs)
#######################################################################
def uniquelist(old):
    """
    remove duplicates from a list

    Returns a new list holding each element of 'old' once, in the order
    of first appearance, so that callers picking element [0] get a
    reproducible result.  Elements must be hashable.
    """
    seen = {}
    unique = []
    for e in old:
        if e not in seen:
            seen[e] = 0
            unique.append(e)
    return unique
########################################################################
def findAllCollections(owner,dataset,data_tier,verbose):
    """
    Contact RefDB and find the CollID of all the user required collections

    owner, dataset  identify the data in RefDB
    data_tier       list of data-tier names requested by the user
    verbose         '1' to print the selected collections

    The first collection returned by GetRefDBInfo is taken as the primary
    one and is always selected; further collections are selected only
    when their data tier is listed in data_tier.

    Returns the list of selected collection IDs.
    Exits with status 10 when RefDB cannot be queried and with status 1
    when a requested data tier is not among those reported by RefDB.
    """
    try:
        collInfos = GetRefDBInfo(owner,dataset)
    except Exception:
        # GetRefDBInfo already printed the reason
        sys.exit(10)
    NeededCollID = []
    refdbdataTiers = []
    for position, coll in enumerate(collInfos):
        collid = coll[0]
        tier = coll[2]
        refdbdataTiers.append(tier)
        if position == 0:
            ## select the primary collection
            NeededCollID.append(collid)
            if verbose == '1':
                print(' '.join(["\n --> primary collection for owner ", str(owner), " is: ID=", str(collid), " DataTier=", str(tier)]))
        elif tier in data_tier:
            ## select only the parents collections corresponding to data-tiers requested by the user
            NeededCollID.append(collid)
            if verbose == '1':
                print(' '.join([" --> further collection required: ID=", str(collid), " DataTier=", str(tier)]))
    ## check that the user asks for Data Tier really existing in RefDB, otherwise give a warning message
    for dt in data_tier:
        if dt not in refdbdataTiers:
            print("")
            print(' '.join(["ERROR: Data Tier ( =>", str(dt), "<= ) not existing for dataset/owner ", str(dataset), "/", str(owner), "!"]))
            print("Check the data_tier variable")
            print(' '.join(['Owner Dataset not published with asked dataTiers! ', str(owner), ' ', str(dataset), ' ', str(data_tier)]))
            print("")
            sys.exit(1)
    return NeededCollID
########################################################################
def findPubDBs(CollIDs):
    """
    Find the list of PubDB URLs having ALL the required collections

    Every collection ID is looked up with findPubDBsbyCollID.  A URL is
    kept when it was reported once per requested collection AND contains
    the text 'fnal.gov'; URLs of other sites are dropped.
    """
    needed = 0
    reported = []
    ### gather the PubDB urls of every required collection
    for needed, collid in enumerate(CollIDs, 1):
        reported.extend(findPubDBsbyCollID(collid))
    ### keep the urls seen for all the collections, FNAL ones only
    return [candidate for candidate in uniquelist(reported)
            if reported.count(candidate) == needed and 'fnal.gov' in candidate]
#######################################################################
def findPubDBsbyCollID(CollID):
    """
    Find the list of PubDB URLs having a given Collection

    Queries the central RefDB-PubDBs map for 'CollID' (a string) and
    returns the list of distinct values found after 'PubDBURL=' in the
    answer.  Each value is cut at its first '=' sign, if it has one.
    When the map cannot be contacted an error is printed and an empty
    list is returned.
    """
    PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
    RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.php?display=1'
    ### contact the RefDB-PubDBs map to discovery where the given CollID is
    url = PubDBCentralUrl_+RefDBPubDBsmapPhp_+'&CollID=' + CollID
    try:
        f = urllib.urlopen(url)
    except IOError:
        print('\nERROR accessing RefDB-PubDBs map at '+url+'\n')
        # Without an answer there is nothing to scan
        return []
    ### search for the PubDBURL string
    reURLLine = re.compile(r'PubDBURL=(\S*)')
    PubDBURLs = []
    try:
        for line in f.readlines():
            found = reURLLine.search(line)
            if found:
                # 'PubDBURL=<value>' split on '=': element 1 is the value
                # up to its own first '=' (if any)
                PubDBURLs.append(found.group().split('=')[1])
    finally:
        f.close()
    ### return the list of PubDBURL where the collection is present
    return uniquelist(PubDBURLs)
########################################################################
def getPubDBData(CollIDs,url,data_tier):
    """
    Contact a PubDB to collect all the relevant information

    CollIDs    list of collection ID strings (left unmodified)
    url        a PubDB URL; everything up to its last '/' is the base url
    data_tier  list of requested data tiers

    Returns whatever GetPubDBInfo returns for the collections, or an
    empty list when that call fails (an error message is printed).
    """
    ### get the base PubDb url
    baseurl = url[:url.rfind('/') + 1]
    # Work on a copy so the caller's list is not extended behind its back
    wanted = list(CollIDs)
    ## add the PU among the required Collections if the Digi are requested
    # ( asking it directly to the PubDB so the RefDB level data discovery is
    # bypassed..... in future when everybody is move to newstyle it
    # will be possible to ask for PU , at RefDB level, in method findAllCollections )
    #
    if data_tier.count('Digi'):
        PUCollID = getDatatierCollID(baseurl, '-'.join(wanted), "PU")
        if PUCollID:
            wanted.append(PUCollID)
    ### (get info for all the collections in one shot and unserialize the content)
    Collections = '-'.join(wanted)
    try:
        return GetPubDBInfo(baseurl, Collections)
    except Exception:
        print('\nERROR extracting info for collections '+Collections+' from PubDB '+baseurl+'. The PU might not be published at that site.\n')
        return []
########################################################################
def getDatatierCollID(urlbase,CollIDString,datatier):
    """
    Contact a script of PubDB to retrieve the collid of a DataTier

    urlbase       base url of the PubDB (with trailing '/')
    CollIDString  collection IDs joined with '-'
    datatier      name of the data tier to look up (e.g. "PU")

    Returns the text following 'collid=' in the answer, or None when the
    PubDB cannot be contacted (an error is printed) or the answer holds
    no 'collid=' entry.
    """
    try:
        f = urllib.urlopen(urlbase+'pubdb-get-collidbydatatier.php?collid='+CollIDString+"&datatier="+datatier)
    except IOError:
        print('\nERROR extracting info for collections '+CollIDString+' from PubDB '+urlbase+'. The PU might not be published at that site.\n')
        return None
    try:
        data = f.read()
    finally:
        f.close()
    colldata = re.compile(r'collid=(\S*)').search(data)
    if colldata:
        return colldata.group(1)
    # No such collection published at this PubDB
    return None
########################################################################
def GetPubDBInfo(pubdburl,Collections):
    """
    Extract all the information from the PubDB analysis interface

    pubdburl     base url of the PubDB (with trailing '/')
    Collections  collection IDs joined with '-'

    Returns the list of 'ContactString' values of every catalogue of
    every collection in the answer.

    An error message is printed and the exception is (re-)raised when the
    URL cannot be read, when the PubDB answers with an HTML page instead
    of serialized data, when the answer cannot be unserialized, or when
    a record lacks the 'Catalogue' / 'ContactString' fields.
    """
    try:
        f = urllib.urlopen(pubdburl+'pubdb-get-analysisinfo.php?collid='+Collections)
        try:
            data = f.read()
        finally:
            f.close()
    except IOError:
        print('\nERROR accessing PubDB for Collections: '+Collections+'\n')
        raise
    # An answer starting with '<' is an HTML page, not serialized data
    if data[:1] == '<':
        print('\nERROR No Collections found in PubDB : '+Collections+'\n')
        raise Exception("PubDB returned an HTML page instead of serialized data")
    try:
        catalogues = unserialize(data)
    except Exception:
        print('\nERROR Unserializing: '+data+'\n')
        raise
    colllist = []
    try:
        for k in catalogues.keys():
            cat = catalogues[k]['Catalogue']
            for kcat in cat.keys():
                colllist.append(cat[kcat]['ContactString'])
    except (AttributeError, KeyError, TypeError):
        print('\nERROR Unserializing: '+data+'\n')
        raise
    return colllist
########################################################################
def _default_data_tiers(owner):
    """
    Derive the maximal list of data tiers from the owner name, used when
    the user gives no -data_tier option.
    """
    if owner.count('Hit') > 0:
        return ['Hit']
    if owner.count('DST') > 0:
        return ['DST', 'Digi', 'Hit']
    return ['Digi', 'Hit']


def _print_usage():
    """Print the command-line help."""
    print('extract.py usage:')
    print('')
    print('-owner : (requested)')
    print('-dataset : (requested)')
    print('-data_tier : if not specified, maximal data_tier derived from dataset/owner (optional) ')
    print('-pubdb : if not specified, frist from pubdb discovery (add complete path and trailing / ) (optional)')
    print('-cat_path : default=xmlcatalog_file:/uscms/METADATA for FNAL (optional)')
    print('-html : html format (optional)')
    print('-verbose / -v : verbose output (optional)')
    print('-help / -h : this message (optional)')
    print('')


def main(argv):
    """
    Look up the catalogues of an owner/dataset pair and print them as an
    'InputFileCatalogURL=@{ ... }@' block.

    argv is the command line without the program name.  The usage text is
    printed (exit status 0) when help is requested or when -owner or
    -dataset is missing.
    """
    # parse command line options
    options = parseOptions(argv)
    # An option given without a value is stored as None: treat it as absent
    owner = options.get('-owner') or 'blank'
    dataset = options.get('-dataset') or 'blank'
    pubdb = options.get('-pubdb') or 'blank'
    # What follows is a default for FNAL
    cat_path = options.get('-cat_path') or 'xmlcatalog_file:/uscms/METADATA'
    verbose = '0'
    if '-verbose' in options or '-v' in options:
        verbose = '1'
    show_help = '-help' in options or '-h' in options
    # Suffix appended to every printed line in html mode.
    # NOTE(review): the literal was split over two lines in the source
    # (a syntax error); '<br>' is assumed to be the lost text - confirm.
    format_html = ''
    if '-html' in options:
        format_html = '<br>'
    data_tier = []
    if options.get('-data_tier'):
        data_tier = [tier.strip() for tier in options['-data_tier'].split(',')]
    if len(data_tier) == 0:
        data_tier = _default_data_tiers(owner)

    # error output or usage output
    if show_help or owner == 'blank' or dataset == 'blank':
        _print_usage()
        sys.exit(0)

    if verbose == '1':
        print('Running mode'+format_html)
        print(''+format_html)
        print('owner = '+owner+format_html)
        print('dataset = '+dataset+format_html)
        print(' '.join(['data_tier = ', str(data_tier), format_html]))
        if pubdb != 'blank':
            print(' '.join(['pubdb = ', str(pubdb), format_html]))
        print(''+format_html)

    ### find the user-required collection IDs
    CollIDs = findAllCollections(owner,dataset,data_tier,verbose)
    ### find the PubDB URLs publishing the needed data
    if pubdb == 'blank':
        urllist = findPubDBs(CollIDs)
    else:
        urllist = [pubdb]

    if len(urllist) > 0:
        ### collect information for first pubdb
        res = getPubDBData(CollIDs,urllist[0],data_tier)
        ### make unique list
        res = uniquelist(res)
        print(' '.join(['# These entries were pulled from from PubDB:', str(urllist[0]), format_html]))
        print('InputFileCatalogURL=@{'+format_html)
        for result in res:
            # Keep only what follows the '8080' port in the contact string
            ress = result.split('8080')
            if len(ress) > 1:
                print(cat_path+ress[1]+format_html)
        print('}@'+format_html)
    else:
        # urllist is empty here, so there is no PubDB url to report
        print(''+format_html)
        print(''+format_html)
        print('# The Requested Dataset is not available '+format_html)
        print(' '.join(['# On the pubDB: ', '(none found by PubDB discovery)', format_html]))
        print(''+format_html)


if __name__ == '__main__':
    main(sys.argv[1:])