#!/usr/bin/env python
"""Look up the file catalogues published for a CMS owner/dataset.

The script asks RefDB for the collections belonging to an owner/dataset
pair, asks the central RefDB-PubDBs map which PubDB sites publish those
collections, queries the first such PubDB and prints an
``InputFileCatalogURL=@{ ... }@`` fragment on standard output.

The module also contains a small parser for PHP ``serialize()`` output,
which is the format the remote PHP pages answer in.

Runs on Python 2.7 and Python 3.
"""

import os
import re
import string
import sys
import urllib
from collections import OrderedDict

try:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote, urlopen

_REFDB_URL = 'http://cmsdoc.cern.ch/cms/production/www/'
_REFDB_COLLECTION_TREE_PHP = 'cgi/SQL/CollectionTree.php'
_PUBDB_CENTRAL_URL = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
_REFDB_PUBDBS_MAP_PHP = 'GetPublishedCollectionInfoFromRefDB.php?display=1'

_PUBDB_URL_RE = re.compile(r'PubDBURL=(\S*)')
_COLLID_RE = re.compile(r'collid=(\S*)')

# Converters for the scalar tokens that all share the "t:<text>;" layout.
_SCALAR_CONVERTERS = {
    'i': int,
    'b': lambda raw: int(raw) == 1,
    'd': float,
}


###########################################################################
def _say(*items):
    """Write items to stdout separated by single spaces, then a newline.

    Stands in for the Python 2 ``print a, b, c`` statement so that the
    output looks the same under Python 2 and Python 3.
    """
    sys.stdout.write(' '.join([str(item) for item in items]) + '\n')


def _to_text(data):
    """Return data as a native string.

    Python 3 byte strings are decoded as latin-1 so that one byte maps to
    exactly one character; the length fields in serialized strings are
    then still valid as character counts.
    """
    if isinstance(data, bytes) and not isinstance(data, str):
        return data.decode('latin-1')
    return data


def _fetch(url):
    """Download url and return the response body as text.

    Errors raised by ``urlopen`` propagate to the caller; the connection
    is always closed.
    """
    handle = urlopen(url)
    try:
        return _to_text(handle.read())
    finally:
        handle.close()


###########################################################################
def unserialize(data):
    """Parse one PHP-serialized value and return it as a Python object.

    Arrays and objects come back as insertion-ordered dicts.
    Raises ValueError on malformed or unsupported input.
    """
    return intern_unserialize(_to_text(data), 0)[2]


###########################################################################
def _read_pairs(data, pos, count):
    """Read count key/value tokens starting at data[pos].

    Returns (mapping, position just after the last value).
    """
    mapping = OrderedDict()
    for _ in range(count):
        _ktype, kchars, key = intern_unserialize(data, pos)
        pos += kchars
        _vtype, vchars, value = intern_unserialize(data, pos)
        pos += vchars
        mapping[key] = value
    return mapping, pos


def intern_unserialize(data, offset=0):
    """
    Find the next token and unserialize it.
    Recurse on array and object.

    offset = raw offset from start of data

    Returns (type letter in lower case, characters consumed, value).
    Raises ValueError for malformed input or an unhandled type letter.
    """
    dtype = data[offset:offset + 1].lower()
    # Every token starts with a type letter and one separator: "t:" or "N;".
    pos = offset + 2

    if dtype in _SCALAR_CONVERTERS:
        # i => Integer, b => Boolean, d => Floating Point
        chars, raw = read_until(data, pos, ';')
        value = _SCALAR_CONVERTERS[dtype](raw)
        # +1 for end semicolon
        pos += chars + 1

    elif dtype == 'n':
        # n => None
        value = None

    elif dtype == 's':
        # s => String, laid out as s:<length>:"<chars>";
        chars, declared = read_until(data, pos, ':')
        declared = int(declared)
        # length digits, the colon after them and the opening quote
        pos += chars + 2
        chars, value = read_chars(data, pos, declared)
        if chars != declared:
            raise ValueError("String length mismatch")
        # +2 for end quote and semicolon
        pos += chars + 2

    elif dtype == 'o':
        # o => Object, laid out as O:<len>:"<class>":<count>:{<pairs>}
        # The class name is skipped; only the properties are returned.
        chars, _name_length = read_until(data, pos, ':')
        pos += chars + 1
        chars, _class_name = read_until(data, pos, ':')
        pos += chars + 1
        chars, count = read_until(data, pos, ':')
        # count digits, the colon after them and the opening brace
        pos += chars + 2
        value, pos = _read_pairs(data, pos, int(count))
        # +1 for closing brace
        pos += 1

    elif dtype == 'a':
        # array => Dict
        # If you originally serialized a Tuple or List, it will
        # be unserialized as a Dict.  PHP doesn't have tuples or lists,
        # only arrays - so everything has to get converted into an array
        # when serializing and the original type of the array is lost
        chars, count = read_until(data, pos, ':')
        # count digits, the colon after them and the opening brace
        pos += chars + 2
        value, pos = _read_pairs(data, pos, int(count))
        # +1 for closing brace
        pos += 1

    else:
        # I don't know how to unserialize this
        raise ValueError("Unknown / Unhandled data type (%s)!" % dtype)

    return (dtype, pos - offset, value)


###########################################################################
def read_until(data, offset, stopchar):
    """
    Read from data[offset] until you encounter some char 'stopchar'.

    Returns (chars_read, text) where text excludes stopchar.
    Raises ValueError if stopchar does not occur at or after offset.
    """
    end = data.find(stopchar, offset)
    if end < 0:
        # Consumed all the characters and haven't found stopchar
        raise ValueError("Invalid")
    return (end - offset, data[offset:end])


###########################################################################
def read_chars(data, offset, length):
    """
    Read 'length' number of chars from data[offset].

    Returns (chars_read, text); chars_read is smaller than length when
    data ends early.
    """
    chunk = data[offset:offset + length]
    return (len(chunk), chunk)


###########################################################################
def parseOptions(argv):
    """
    Parses command-line options.
    Returns a dictionary with specified options as keys:
    -opt1           --> 'opt1' : None
    -opt2 val       --> 'opt2' : 'val'
    -opt3=val       --> 'opt3' : 'val'
    Usually called as
    options = parseOptions(sys.argv[1:])

    Arguments that do not start with '-' and are not consumed as a value
    are ignored.
    """
    options = {}
    argc = len(argv)
    i = 0
    while i < argc:
        arg = argv[i]
        i += 1
        # startswith() also copes with an empty argument
        if not arg.startswith('-'):
            continue
        eq = arg.find('=')
        if eq > 0:
            options[arg[:eq]] = arg[eq + 1:]
            continue
        val = None
        if i < argc and not argv[i].startswith('-'):
            val = argv[i]
            i += 1
        options[arg] = val
    return options


###########################################################################
def GetRefDBInfo(owner, dataset):
    """
    Ask RefDB for the collections of an owner/dataset pair.

    Returns a list of [id, name, type, oname, dname] lists, in the order
    RefDB reported them.  Prints an error message and raises if RefDB
    cannot be reached or its answer cannot be interpreted.
    """
    url = (_REFDB_URL + _REFDB_COLLECTION_TREE_PHP
           + '?format=serialized&owner=' + quote(owner)
           + '&dataset=' + quote(dataset))
    try:
        data = _fetch(url)
    except IOError:
        _say('\nERROR accessing RefDB for Owner/Dataset: ' + owner + '/' + dataset + '\n')
        raise

    # An answer starting with '<' is markup, not serialized data.
    if data.startswith('<'):
        if data.find("down") > -1:
            _say("\n WARNING: RefDB is temporarily down for a short maintenace \n")
            _say('\nERROR accessing RefDB for Owner/Dataset: ' + owner + '/' + dataset + '\n')
        else:
            _say('\nERROR No Collection Owner/Dataset found in RefDB : ' + owner + '/' + dataset + '\n')
        raise ValueError('RefDB did not return serialized data')

    try:
        colls = unserialize(data)
    except ValueError:
        _say('\nERROR Unserializing: ' + data + '\n')
        raise

    try:
        return [[c['id'], c['name'], c['type'], c['oname'], c['dname']]
                for c in colls.values()]
    except (AttributeError, KeyError, TypeError):
        _say('ERROR accessing PHP: ', data, 'isn\'t updated version \n')
        raise


###########################################################################
def findPubDBbyCollID(CollID):
    """
    Find PubDB URLs having a given Collection

    Same as findPubDBsbyCollID; both names are kept for existing callers.
    """
    return findPubDBsbyCollID(CollID)


#######################################################################
def uniquelist(old):
    """
    remove duplicates from a list

    Returns a new list that keeps the first occurrence of each element.
    Elements must be hashable.
    """
    seen = set()
    unique = []
    for element in old:
        if element not in seen:
            seen.add(element)
            unique.append(element)
    return unique


########################################################################
def findAllCollections(owner, dataset, data_tier, verbose):
    """
    Contact RefDB and find the CollID of all the user required collections

    The first collection reported by RefDB is always selected; later ones
    only when their type is listed in data_tier.  verbose is the string
    '1' to print progress.  Exits with status 10 if RefDB cannot be
    queried and with status 1 if a requested data tier does not exist.
    """
    try:
        collInfos = GetRefDBInfo(owner, dataset)
    except Exception:
        sys.exit(10)

    NeededCollID = []
    refdbdataTiers = []
    for index, coll in enumerate(collInfos):
        collid, tier = coll[0], coll[2]
        refdbdataTiers.append(tier)
        if index == 0:
            ## select the primary collection
            NeededCollID.append(collid)
            if verbose == '1':
                _say("\n --> primary collection for owner ", owner, " is: ID=", collid, " DataTier=", tier)
        elif tier in data_tier:
            ## select only the parents collections corresponding to data-tiers requested by the user
            NeededCollID.append(collid)
            if verbose == '1':
                _say(" --> further collection required: ID=", collid, " DataTier=", tier)

    ## check that the user asks for Data Tier really existing in RefDB, otherwise give a warning message
    for dt in data_tier:
        if dt not in refdbdataTiers:
            _say("")
            _say("ERROR: Data Tier ( =>", dt, "<= ) not existing for dataset/owner ", dataset, "/", owner, "!")
            _say("Check the data_tier variable")
            _say('Owner Dataset not published with asked dataTiers! ', owner, ' ', dataset, ' ', data_tier)
            _say("")
            sys.exit(1)

    return NeededCollID


########################################################################
def findPubDBs(CollIDs, site='fnal.gov'):
    """
    Find the list of PubDB URLs having ALL the required collections

    Only URLs containing the substring site are returned; pass None to
    keep every URL.
    """
    ### prepare a list all PubDB urls for all collections
    allurls = []
    countColl = 0
    for CollID in CollIDs:
        countColl += 1
        allurls.extend(findPubDBsbyCollID(CollID))

    occurrences = {}
    for url in allurls:
        occurrences[url] = occurrences.get(url, 0) + 1

    ### select only PubDB urls that contains all the collections
    SelectedPubDBURLs = []
    for url in uniquelist(allurls):
        # the per-collection lists are duplicate free, so a url that shows
        # up once per collection is published for all of them
        if occurrences[url] != countColl:
            continue
        if site is None or site in url:
            SelectedPubDBURLs.append(url)
    return SelectedPubDBURLs


#######################################################################
def findPubDBsbyCollID(CollID):
    """
    Find the list of PubDB URLs having a given Collection

    Prints an error message and re-raises if the RefDB-PubDBs map cannot
    be read.
    """
    ### contact the RefDB-PubDBs map to discovery where the given CollID is
    url = _PUBDB_CENTRAL_URL + _REFDB_PUBDBS_MAP_PHP + '&CollID=' + str(CollID)
    try:
        page = _fetch(url)
    except IOError:
        _say('\nERROR accessing RefDB-PubDBs map at ' + url + '\n')
        raise

    ### search for the PubDBURL string
    PubDBURLs = []
    for line in page.splitlines():
        found = _PUBDB_URL_RE.search(line)
        if found:
            # Keep only the text between the first and second '='.
            # NOTE(review): this cuts a URL short if it contains '=' itself;
            # kept as is because findPubDBs compares these URLs across
            # collections - confirm before changing.
            PubDBURLs.append(found.group().split('=')[1])

    ### return the list of PubDBURL where the collection is present
    return uniquelist(PubDBURLs)


########################################################################
def getPubDBData(CollIDs, url, data_tier):
    """
    Contact a PubDB to collect all the relevant information

    Returns the list produced by GetPubDBInfo, or an empty list (after
    printing an error) when the PubDB query fails.  CollIDs is not
    modified.
    """
    ### get the base PubDb url
    base = url[:url.rfind('/') + 1]

    # work on a copy so the caller's list is left untouched
    wanted = [str(collid) for collid in CollIDs]

    ## add the PU among the required Collections if the Digi are requested
    # ( asking it directly to the PubDB so the RefDB level data discovery is
    #   bypassed..... in future when everybody is move to newstyle it
    #   will be possible to ask for PU , at RefDB level, in method findAllCollections )
    if 'Digi' in data_tier:
        PUCollID = getDatatierCollID(base, '-'.join(wanted), "PU")
        if PUCollID:
            wanted.append(PUCollID)

    ### (get info for all the collections in one shot and unserialize the content)
    Collections = '-'.join(wanted)
    try:
        return GetPubDBInfo(base, Collections)
    except Exception:
        _say('\nERROR extracting info for collections ' + Collections + ' from PubDB ' + base
             + '. The PU might not be published at that site.\n')
        return []


########################################################################
def getDatatierCollID(urlbase, CollIDString, datatier):
    """
    Contact a script of PubDB to retrieve the collid of a DataTier

    Returns the collection id as a string, or None when the PubDB cannot
    be reached or its answer contains no collid.
    """
    url = (urlbase + 'pubdb-get-collidbydatatier.php?collid=' + CollIDString
           + "&datatier=" + datatier)
    try:
        data = _fetch(url)
    except IOError:
        _say('\nERROR extracting info for collections ' + CollIDString + ' from PubDB ' + urlbase
             + '. The PU might not be published at that site.\n')
        return None

    colldata = _COLLID_RE.search(data)
    if colldata:
        return colldata.group(1)
    return None


########################################################################
def GetPubDBInfo(pubdburl, Collections):
    """
    Extract all the information from the PubDB analysis interface

    Returns the list of ContactString values of every catalogue of every
    collection in the answer.  Prints an error message and raises if the
    PubDB cannot be reached or its answer cannot be interpreted.
    """
    try:
        data = _fetch(pubdburl + 'pubdb-get-analysisinfo.php?collid=' + Collections)
    except IOError:
        _say('\nERROR accessing PubDB for Collections: ' + Collections + '\n')
        raise

    # An answer starting with '<' is markup, not serialized data.
    if data.startswith('<'):
        _say('\nERROR No Collections found in PubDB : ' + Collections + '\n')
        raise ValueError('PubDB did not return serialized data')

    try:
        catalogues = unserialize(data)
        colllist = []
        for entry in catalogues.values():
            for cat in entry['Catalogue'].values():
                colllist.append(cat['ContactString'])
    except (AttributeError, KeyError, TypeError, ValueError):
        _say('\nERROR Unserializing: ' + data + '\n')
        raise
    return colllist


########################################################################
def _default_data_tiers(owner):
    """Return the maximal data tier list derived from the owner name."""
    if 'Hit' in owner:
        return ['Hit']
    if 'DST' in owner:
        return ['DST', 'Digi', 'Hit']
    return ['Digi', 'Hit']


def _print_usage():
    """Print the command-line help."""
    _say('extract.py usage:')
    _say('')
    _say('-owner : (requested)')
    _say('-dataset : (requested)')
    _say('-data_tier : if not specified, maximal data_tier derived from dataset/owner (optional) ')
    _say('-pubdb : if not specified, frist from pubdb discovery (add complete path and trailing / ) (optional)')
    _say('-cat_path : default=xmlcatalog_file:/uscms/METADATA for FNAL (optional)')
    _say('-html / -format_html : html format (optional)')
    _say('-verbose / -v : verbose output (optional)')
    _say('-help / -h : this message (optional)')
    _say('')


def main(argv):
    """Run the command-line tool on argv (the arguments after the script name)."""
    # parse command line options
    options = parseOptions(argv)

    # An option given without a value counts as not given.
    owner = options.get('-owner') or 'blank'
    dataset = options.get('-dataset') or 'blank'
    pubdb = options.get('-pubdb') or 'blank'
    # What follows is a default for FNAL
    cat_path = options.get('-cat_path') or 'xmlcatalog_file:/uscms/METADATA'

    verbose = '0'
    if '-verbose' in options or '-v' in options:
        verbose = '1'
    show_help = '-help' in options or '-h' in options

    format_html = ''
    if '-html' in options or '-format_html' in options:
        # NOTE(review): in the file as received this literal is a line break
        # between the quotes; it may originally have been an HTML line-break
        # tag that was lost - confirm.
        format_html = '\n'

    data_tier = []
    for tier in (options.get('-data_tier') or '').split(','):
        tier = tier.strip()
        if tier:
            data_tier.append(tier)

    # error output or usage output
    if show_help or owner == 'blank' or dataset == 'blank':
        _print_usage()
        sys.exit(0)

    if not data_tier:
        data_tier = _default_data_tiers(owner)

    if verbose == '1':
        _say('Running mode' + format_html)
        _say('' + format_html)
        _say('owner = ' + owner + format_html)
        _say('dataset = ' + dataset + format_html)
        _say('data_tier = ', data_tier, format_html)
        if pubdb != 'blank':
            _say('pubdb = ', pubdb, format_html)
        _say('' + format_html)

    ### find the user-required collection IDs
    CollIDs = findAllCollections(owner, dataset, data_tier, verbose)

    ### find the PubDB URLs publishing the needed data
    if pubdb == 'blank':
        urllist = findPubDBs(CollIDs)
    else:
        urllist = [pubdb]

    if urllist:
        ### collect information for first pubdb and make unique list
        res = uniquelist(getPubDBData(CollIDs, urllist[0], data_tier))
        _say('# These entries were pulled from from PubDB:', urllist[0], format_html)
        _say('InputFileCatalogURL=@{' + format_html)
        for result in res:
            # keep what follows the '8080' port number of the contact string
            ress = result.split('8080')
            if len(ress) > 1:
                _say(cat_path + ress[1] + format_html)
        _say('}@' + format_html)
    else:
        # No PubDB was found, so there is no URL to report here.
        _say('' + format_html)
        _say('' + format_html)
        _say('# The Requested Dataset is not available ' + format_html)
        _say('# On the pubDB: ', '(no PubDB found)', format_html)
        _say('' + format_html)


if __name__ == '__main__':
    main(sys.argv[1:])