#!/usr/bin/python
import sys, os, getopt, urllib
# extract_local: fetch one site's local PubDB summary page and parse it into
# a nested dictionary {dataset: {owner: events}}.
#
# Parameters:
#   site_para -- site key; must be a key of local_url_dictionary below
#                (an unknown key raises KeyError at the lookup).
#   debug     -- 1 prints the finished dictionary, 2 prints every row and
#                column while parsing; error messages are printed regardless.
# Returns:
#   local_dataset_dict, mapping dataset -> {owner: event count (int)}.
#
# NOTE(review): this copy of the file has lost its indentation, and several
# string literals in the parsing loop (the arguments to find/split/replace)
# are split across lines or look truncated -- presumably markup was stripped
# from them. Restore them from a pristine copy before running; confirm.
def extract_local(site_para,debug) :
# Summary page URL for each supported site.
local_url_dictionary = {}
local_url_dictionary['NEBR'] = 'http://phedex.unl.edu/PubDB/pubdb-get-summary.php'
local_url_dictionary['WISC'] = 'http://www.hep.wisc.edu/cms/comp/PubDB/pubdb-get-summary.php'
local_url_dictionary['UFL'] = 'http://tier2.phys.ufl.edu/PubDB/www/pubdb-get-summary.php'
local_url_dictionary['PURDUE'] = 'http://phedex.rcac.purdue.edu/pubdb-get-summary.php'
local_url_dictionary['UCSD'] = 'http://tier2.ucsd.edu/cms/PubDB/www-PubDB/pubdb-get-summary.php'
local_url_dictionary['CALT'] = 'http://cithep66.ultralight.org/PubDB-Prod/www/pubdb-get-summary.php'
# Disabled alternative table of file names; apparently meant to be used with
# the commented-out open(local_url) further down -- TODO confirm.
#local_url_dictionary['NEBR'] = 'nebr-pubdb-get-summary.php.html'
#local_url_dictionary['WISC'] = 'wisc-pubdb-get-summary.php.html'
#local_url_dictionary['UFL'] = 'ufl-pubdb-get-summary.php.html'
#local_url_dictionary['PURDUE'] = 'purdue-pubdb-get-summary.php.html'
#local_url_dictionary['UCSD'] = 'ucsd-pubdb-get-summary.php.html'
#local_url_dictionary['CALT'] = 'caltech-pubdb-get-summary.php.html'
local_url = local_url_dictionary[site_para]
# NOTE(review): the object returned by urlopen is never closed in this function.
local_page = urllib.urlopen(local_url)
# local dictionary
local_dataset_dict = {}
# local_page = open(local_url)
# The page is consumed one line at a time; an empty string from readline()
# ends the loop.
line = local_page.readline()
# counter numbers the rows that contain the column marker (debug output only).
counter = 0
while line :
# Drop the blank between adjacent '>' and '<' before looking for markers.
line = line.replace('> <','><')
# Split the line into rows, then each row into columns.
# NOTE(review): the marker literals below are the damaged ones.
if line.find("
") > -1 :
row_array = line.split("
")
for row in row_array :
row = row.replace("
",'')
if row.find("| ") > -1 :
counter += 1
if debug == 2 :
print 'row:',counter,row
column_counter = 0
column_array = row.split(" | ")
# Defaults double as "not parsed" markers for the check after the column loop.
dataset = ''
owner = ''
events = 0
for column in column_array :
column = column.replace(" | ","")
column = column.replace(" | ","")
# Only non-empty columns are counted: the first is taken as the dataset,
# the second as the owner, the third as the event count.
if column != '' :
column_counter += 1
if debug == 2 :
print 'column:',column_counter,column
if column_counter == 1 :
dataset = column
if column_counter == 2 :
owner = column
if column_counter == 3 :
# int() raises ValueError if the third column is not numeric.
events = int(column)
if column_counter > 3 :
print 'LOCAL ERROR: More elements than expected in row',row
# NOTE(review): because events == 0 is treated as "not parsed", a row that
# genuinely lists 0 events is also reported as a parsing problem.
if dataset == '' or owner == '' or events == 0 :
print 'LOCAL ERROR: parsing into dataset,owner,events problen in row',row
else :
# Store the entry; a repeated (dataset, owner) pair is reported and the
# value stored first is kept.
if dataset in local_dataset_dict.keys() :
local_owner_dict = local_dataset_dict[dataset]
if owner in local_owner_dict.keys():
print 'LOCAL ERROR: owner:',owner,' entry already existing for dataset:',dataset,'from row:',row
else:
local_owner_dict[owner] = events
else :
local_owner_dict = {}
local_owner_dict[owner] = events
local_dataset_dict[dataset] = local_owner_dict
# Advance to the next line of the page.
line = local_page.readline()
# print local dataset
if debug == 1 :
for dataset in local_dataset_dict.keys() :
local_owner_dict = local_dataset_dict[dataset]
for owner in local_owner_dict.keys() :
events = local_owner_dict[owner]
print 'LOCAL DICT: dataset: %-40s owner: %-40s events: %10d' % (dataset,owner,events)
return local_dataset_dict
# extract_global: fetch the global published-collection page and return the
# entries belonging to one site as {dataset: {owner: events}}.
#
# Parameters:
#   site_para -- only rows whose parsed site string equals this value are kept.
#   debug     -- 1 prints the finished dictionary, 2 prints each parsed row.
# Returns:
#   global_dataset_dict, mapping dataset -> {owner: event count (int)}.
#
# NOTE(review): this copy of the file has lost its indentation, and several
# string literals below (the startswith markers and the replace arguments)
# are split across lines or empty -- presumably markup was stripped from
# them. Restore them from a pristine copy before running; confirm.
def extract_global(site_para,debug) :
# The catalog URL is fixed here; neither parameter overrides it.
global_url = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/GetPublishedCollectionInfoFromRefDB.mod.php'
# NOTE(review): the object returned by urlopen is never closed in this function.
global_page = urllib.urlopen(global_url)
# global dictionary
global_dataset_dict = {}
# global_page = open('publication_complete.html')
# global_page = open('publication.html')
# The page is consumed one line at a time; an empty string from readline()
# ends the loop.
line = global_page.readline()
# row_counter numbers the rows recognised by the marker test (debug output only).
row_counter = 0
while line :
line = line.strip()
# A line starting with one of two markers opens a row.
# NOTE(review): both marker literals are damaged in this copy.
if line.startswith("
") or line.startswith("
") :
row_counter += 1
# The three lines following the marker are read as the dataset line, the
# owner line and a combined events/site line.
dataset_line = global_page.readline()
dataset_line = dataset_line.strip()
# NOTE(review): replace("","") as written changes nothing; presumably these
# calls originally removed markup -- confirm against the original.
dataset_line = dataset_line.replace("","")
dataset_line = dataset_line.replace("","")
# The dataset name is whatever follows the last '>' on the line.
dataset_line_array = dataset_line.split(">")
dataset = dataset_line_array[-1]
owner_line = global_page.readline()
owner_line = owner_line.strip()
owner_line = owner_line.replace("","")
owner_line = owner_line.replace("","")
# The owner name is whatever follows the last '>' on the line.
owner_line_array = owner_line.split(">")
owner = owner_line_array[-1]
events_line = global_page.readline()
events_line = events_line.strip()
# Keep an untouched copy: the site is parsed from this same line below.
site_line = events_line
events_line = events_line.replace("| ","")
# The event count is the text before the first '>' and before the first '<';
# int() raises ValueError if that text is not numeric.
events_line_array = events_line.split(">")
events_line_array = events_line_array[0].split("<")
events = int(events_line_array[0])
site_line = site_line.replace(" | HTTP | ","")
# The site is whatever follows the last '>' on the line.
site_line_array = site_line.split(">")
site = site_line_array[-1]
if debug == 2 :
print row_counter
print 'dataset:',dataset
print 'owner:',owner
print 'events:',events
# The trailing comma suppresses the newline after this print.
print 'site:',site,
# Only entries published by the requested site are stored; a repeated
# (dataset, owner) pair is reported and the value stored first is kept.
if site == site_para :
if dataset in global_dataset_dict.keys() :
global_owner_dict = global_dataset_dict[dataset]
if owner in global_owner_dict.keys():
# NOTE(review): `row` is never assigned in this function, so reaching this
# print raises NameError unless a module-level `row` exists -- it looks
# copied from extract_local; row_counter may have been intended. Confirm.
print 'GLOBAL ERROR: owner:',owner,' entry already existing for dataset:',dataset,'from row:',row
else:
global_owner_dict[owner] = events
else :
global_owner_dict = {}
global_owner_dict[owner] = events
global_dataset_dict[dataset] = global_owner_dict
# Advance to the next line of the page.
line = global_page.readline()
# print global dataset
if debug == 1 :
for dataset in global_dataset_dict.keys() :
global_owner_dict = global_dataset_dict[dataset]
for owner in global_owner_dict.keys() :
events = global_owner_dict[owner]
print 'GLOBAL DICT: dataset: %-40s owner: %-40s events: %10d' % (dataset,owner,events)
return global_dataset_dict
# Site names accepted by --site.
_VALID_SITES = ('NEBR', 'WISC', 'UFL', 'PURDUE', 'UCSD', 'CALT')


def _count_entries(dataset_dict):
    """Return (dataset count, (dataset, owner) pair count) of a nested dict."""
    pair_total = sum([len(owners) for owners in dataset_dict.values()])
    return len(dataset_dict), pair_total


def _report_missing(source_dict, target_dict, source_name, target_name):
    """Print each dataset or (dataset, owner) of source_dict absent from target_dict."""
    for dataset in source_dict:
        if dataset not in target_dict:
            # The whole dataset is missing; the owner column is left blank.
            print(' Dataset: %-40s %-40s from %s service not found in %s service'
                  % (dataset, '', source_name, target_name))
            continue
        target_owners = target_dict[dataset]
        for owner in source_dict[dataset]:
            if owner not in target_owners:
                print(' Dataset: %-40s Owner: %-40s from %s service not found in %s service'
                      % (dataset, owner, source_name, target_name))


# main: command-line driver.
#
# argv is the argument list without the program name (e.g. sys.argv[1:]).
# The function parses the options, fetches the local and the global
# publication dictionaries for the requested site, prints how many entries
# each service publishes and then lists what one service has that the other
# lacks. The docstring below is printed verbatim as the usage text.
#
# Exits through sys.exit: status 2 on an option getopt rejects; default
# status after --help and when --site is missing or not a known site.
#
# Every print call passes one pre-formatted string, so the output is the
# same whether print is the Python 2 statement or a function.
def main(argv):
    """
    compare_publication
    compare publication between local and global services
    required parameters:
    --site : site: NEBR,WISC,UFL,PURDUE,UCSD,CALT
    optional parameters:
    --global : url of global catalog (default: http://cmsdoc.cern.ch/cms/production/www/PubDB/GetPublishedCollectionInfoFromRefDB.mod.php)
               NOTE: accepted, but not applied yet (the global url is fixed)
    --help (-h) : help
    --debug (-d) : debug statements
    """
    # defaults
    debug = 0
    site_para = ''

    # The usage text documents -h, -d and --global, so getopt has to be told
    # about them; otherwise each of them is rejected as an unknown option.
    try:
        opts, _positional = getopt.getopt(
            argv, "hd", ["help", "debug", "site=", "global="])
    except getopt.GetoptError:
        print(main.__doc__)
        sys.exit(2)

    # check command line parameter
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(main.__doc__)
            sys.exit()
        elif opt == "--site":
            site_para = arg
        elif opt == "--global":
            # extract_global takes only (site, debug) and uses its built-in
            # url, so the override cannot be forwarded; say so explicitly
            # instead of dropping it silently.
            print(' WARNING: --global is accepted but not applied;'
                  ' the built-in global catalog url is used (ignored: %s)' % arg)
        elif opt in ("-d", "--debug"):
            # Only level 1 can be selected here; the per-row tracing the
            # extract functions do at level 2 is not reachable from the CLI.
            debug = 1

    # An empty (missing) site is rejected by the same membership test.
    if site_para not in _VALID_SITES:
        print(main.__doc__)
        sys.exit()

    print('')
    print(' Compare global and local published datasets for site: %s' % site_para)
    print('')

    local_dict = extract_local(site_para, debug)
    global_dict = extract_global(site_para, debug)

    # Totals per service: local first, then global.
    summary = ' Site: %-7s publishes %s %5d datasets and %5d dataset,owner combinations'
    for label, entries in (('LOCALLY', local_dict), ('GLOBALLY', global_dict)):
        print(summary % ((site_para, label) + _count_entries(entries)))

    # compare datasets, in both directions
    _report_missing(local_dict, global_dict, 'LOCAL', 'GLOBAL')
    _report_missing(global_dict, local_dict, 'GLOBAL', 'LOCAL')
# Script entry point: hand the command-line arguments (without the program
# name) to main() when the file is executed directly, not when it is imported.
if __name__ == '__main__':
    main(sys.argv[1:])