#!/usr/bin/python
"""Compare the datasets a site publishes locally with the global catalog.

Both services are read as HTML pages and reduced to a nested mapping
``{dataset: {owner: events}}``; the two mappings are then diffed.
"""

import getopt
import os
import sys
import urllib

try:
    # Python 3 keeps urlopen in urllib.request.
    from urllib.request import urlopen
except ImportError:
    # Python 2 exposes it directly on the urllib module.
    urlopen = urllib.urlopen


# Summary page of the local publication service, keyed by site name.
LOCAL_SUMMARY_URLS = {
    'NEBR': 'http://phedex.unl.edu/PubDB/pubdb-get-summary.php',
    'WISC': 'http://www.hep.wisc.edu/cms/comp/PubDB/pubdb-get-summary.php',
    'UFL': 'http://tier2.phys.ufl.edu/PubDB/www/pubdb-get-summary.php',
    'PURDUE': 'http://phedex.rcac.purdue.edu/pubdb-get-summary.php',
    'UCSD': 'http://tier2.ucsd.edu/cms/PubDB/www-PubDB/pubdb-get-summary.php',
    'CALT': 'http://cithep66.ultralight.org/PubDB-Prod/www/pubdb-get-summary.php',
}

# Page of the global catalog used when --global is not given.
DEFAULT_GLOBAL_URL = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/GetPublishedCollectionInfoFromRefDB.mod.php'


def _read_page_lines(url):
    """Fetch *url* and return its content as a list of text lines.

    The connection is always closed, even when reading fails.
    """
    page = urlopen(url)
    try:
        raw_lines = page.readlines()
    finally:
        page.close()
    # Python 3 returns bytes; Python 2 already returns str and is left alone.
    return [raw if isinstance(raw, str) else raw.decode('utf-8', 'replace')
            for raw in raw_lines]


def extract_local(site_para, debug):
    """Parse the local summary page of a site.

    Parameters:
        site_para: site name; must be a key of LOCAL_SUMMARY_URLS
                   (a KeyError is raised otherwise).
        debug:     1 prints the resulting dictionary, 2 prints every row
                   and column while parsing.

    Returns:
        dict mapping dataset -> {owner: events}.
    """
    local_dataset_dict = {}
    counter = 0

    for line in _read_page_lines(LOCAL_SUMMARY_URLS[site_para]):
        line = line.replace('> <', '><')
        # NOTE(review): every marker literal in this parser is an empty string
        # in the reviewed source. They look like HTML tag markers lost in
        # extraction: str.find("") always matches and str.split("") raises
        # ValueError. Restore the real markers from the original file.
        if line.find("") > -1:
            row_array = line.split("")
            for row in row_array:
                row = row.replace("", '')
                if row.find("") > -1:
                    counter += 1
                    if debug == 2:
                        print('row: %s %s' % (counter, row))
                    column_counter = 0
                    column_array = row.split("")
                    dataset = ''
                    owner = ''
                    events = 0
                    for column in column_array:
                        column = column.replace("", "")
                        column = column.replace("", "")
                        if column != '':
                            column_counter += 1
                            if debug == 2:
                                print('column: %s %s' % (column_counter, column))
                            if column_counter == 1:
                                dataset = column
                            if column_counter == 2:
                                owner = column
                            if column_counter == 3:
                                try:
                                    events = int(column)
                                except ValueError:
                                    # Leave events at 0 so the row is reported
                                    # by the parse-error branch below instead
                                    # of aborting the whole run.
                                    events = 0
                            if column_counter > 3:
                                print('LOCAL ERROR: More elements than expected in row %s' % (row,))
                    if dataset == '' or owner == '' or events == 0:
                        print('LOCAL ERROR: parsing into dataset,owner,events problen in row %s' % (row,))
                    else:
                        local_owner_dict = local_dataset_dict.setdefault(dataset, {})
                        if owner in local_owner_dict:
                            print('LOCAL ERROR: owner: %s  entry already existing for dataset: %s from row: %s'
                                  % (owner, dataset, row))
                        else:
                            local_owner_dict[owner] = events

    # print local dataset
    if debug == 1:
        for dataset, local_owner_dict in local_dataset_dict.items():
            for owner, events in local_owner_dict.items():
                print('LOCAL DICT: dataset: %-40s owner: %-40s events: %10d' % (dataset, owner, events))

    return local_dataset_dict


def extract_global(site_para, debug, global_url=DEFAULT_GLOBAL_URL):
    """Parse the global catalog page and keep the entries of one site.

    Parameters:
        site_para:  site name; only rows whose site equals it are kept.
        debug:      1 prints the resulting dictionary, 2 prints every
                    parsed row.
        global_url: page to read; defaults to DEFAULT_GLOBAL_URL.

    Returns:
        dict mapping dataset -> {owner: events}.

    Raises:
        ValueError: if the events field of a row is not an integer.
    """
    global_dataset_dict = {}
    row_counter = 0

    # A row spans several physical lines, so the row handler pulls the
    # following lines from the same iterator the outer loop walks.
    lines = iter(_read_page_lines(global_url))
    for line in lines:
        line = line.strip()
        # NOTE(review): the marker literals below are empty strings in the
        # reviewed source. They look like HTML tag markers lost in extraction:
        # as written every line counts as a row start and the replace() calls
        # are no-ops. Restore the real markers from the original file.
        if line.startswith("") or line.startswith(""):
            row_counter += 1

            dataset_line = next(lines, '').strip()
            dataset_line = dataset_line.replace("", "")
            dataset_line = dataset_line.replace("", "")
            dataset = dataset_line.split(">")[-1]

            owner_line = next(lines, '').strip()
            owner_line = owner_line.replace("", "")
            owner_line = owner_line.replace("", "")
            owner = owner_line.split(">")[-1]

            # The same physical line carries both the event count and the site.
            events_line = next(lines, '').strip()
            site_line = events_line
            events_line = events_line.replace("", "")
            events = int(events_line.split(">")[0].split("<")[0])

            site_line = site_line.replace("HTTP", "")
            site = site_line.split(">")[-1]

            if debug == 2:
                print(row_counter)
                print('dataset: %s' % (dataset,))
                print('owner: %s' % (owner,))
                print('events: %s' % (events,))
                print('site: %s' % (site,))

            if site == site_para:
                global_owner_dict = global_dataset_dict.setdefault(dataset, {})
                if owner in global_owner_dict:
                    # Identify the row by its counter; this parser has no
                    # single "row" string to show.
                    print('GLOBAL ERROR: owner: %s  entry already existing for dataset: %s from row: %s'
                          % (owner, dataset, row_counter))
                else:
                    global_owner_dict[owner] = events

    # print global dataset
    if debug == 1:
        for dataset, global_owner_dict in global_dataset_dict.items():
            for owner, events in global_owner_dict.items():
                print('GLOBAL DICT: dataset: %-40s owner: %-40s events: %10d' % (dataset, owner, events))

    return global_dataset_dict


def _report_missing(source_dict, target_dict, source_name, target_name):
    """Print every dataset / owner of *source_dict* absent from *target_dict*."""
    for dataset, source_owners in source_dict.items():
        if dataset not in target_dict:
            print(' Dataset: %-40s %-40s from %s service not found in %s service'
                  % (dataset, '', source_name, target_name))
            continue
        target_owners = target_dict[dataset]
        for owner in source_owners:
            if owner not in target_owners:
                print(' Dataset: %-40s Owner: %-40s from %s service not found in %s service'
                      % (dataset, owner, source_name, target_name))


def main(argv):
    """
    compare_publication

    compare publication between local and global services

    required parameters:
    --site        : site: NEBR,WISC,UFL,PURDUE,UCSD,CALT

    optional parameters:
    --global      : url of global catalog (default: http://cmsdoc.cern.ch/cms/production/www/PubDB/GetPublishedCollectionInfoFromRefDB.mod.php)
    --help (-h)   : help
    --debug (-d)  : debug statements
    """

    # defaults
    debug = 0
    site_para = ''
    global_url = DEFAULT_GLOBAL_URL

    try:
        # "global=" and the short options are registered so that every option
        # listed in the usage text above is actually accepted.
        opts, _args = getopt.getopt(argv, "hd", ["help", "debug", "site=", "global="])
    except getopt.GetoptError:
        print(main.__doc__)
        sys.exit(2)

    # check command line parameter
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(main.__doc__)
            sys.exit()
        elif opt == "--site":
            site_para = arg
        elif opt == "--global":
            global_url = arg
        elif opt in ("-d", "--debug"):
            debug = 1

    # An empty or unknown site cannot be looked up: show the usage and stop.
    if site_para not in LOCAL_SUMMARY_URLS:
        print(main.__doc__)
        sys.exit()

    print('')
    print(' Compare global and local published datasets for site: %s' % (site_para,))
    print('')

    local_dict = extract_local(site_para, debug)
    global_dict = extract_global(site_para, debug, global_url)

    # local entries
    dataset_counter = len(local_dict)
    owner_counter = sum(len(owner_dict) for owner_dict in local_dict.values())
    print(' Site: %-7s publishes LOCALLY %5d datasets and %5d dataset,owner combinations'
          % (site_para, dataset_counter, owner_counter))

    # global entries
    dataset_counter = len(global_dict)
    owner_counter = sum(len(owner_dict) for owner_dict in global_dict.values())
    print(' Site: %-7s publishes GLOBALLY %5d datasets and %5d dataset,owner combinations'
          % (site_para, dataset_counter, owner_counter))

    # compare datasets in both directions
    _report_missing(local_dict, global_dict, 'LOCAL', 'GLOBAL')
    _report_missing(global_dict, local_dict, 'GLOBAL', 'LOCAL')


if __name__ == '__main__':
    main(sys.argv[1:])