From 81b3eee552d08d834e7258eaa20bb123cd7bf69d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9gory=20Soutad=C3=A9?= Date: Wed, 26 Nov 2014 16:17:16 +0100 Subject: [PATCH] Do a lot of things --- awstats_data.py | 12 ++ conf.py | 9 +- default_conf.py | 2 +- iwla.py | 22 +-- iwla_convert.pl | 87 ++++++++--- plugins/display/all_visits.py | 9 +- plugins/display/referers.py | 199 ++++++++++++++++++++++++++ plugins/display/top_visitors.py | 7 +- plugins/post_analysis/reverse_dns.py | 1 + plugins/post_analysis/top_visitors.py | 2 +- plugins/pre_analysis/robots.py | 23 +-- search_engines.py | 8 ++ 12 files changed, 333 insertions(+), 48 deletions(-) create mode 100644 awstats_data.py create mode 100644 plugins/display/referers.py create mode 100644 search_engines.py diff --git a/awstats_data.py b/awstats_data.py new file mode 100644 index 0000000..a6145f0 --- /dev/null +++ b/awstats_data.py @@ -0,0 +1,12 @@ +robots = ['.*appie.*', '.*architext.*', '.*jeeves.*', '.*bjaaland.*', '.*contentmatch.*', '.*ferret.*', '.*googlebot.*', '.*google\-sitemaps.*', '.*gulliver.*', '.*virus[_+ ]detector.*', '.*harvest.*', '.*htdig.*', '.*linkwalker.*', '.*lilina.*', '.*lycos[_+ ].*', '.*moget.*', '.*muscatferret.*', '.*myweb.*', '.*nomad.*', '.*scooter.*', '.*slurp.*', '.*^voyager\/.*', '.*weblayers.*', '.*antibot.*', '.*bruinbot.*', '.*digout4u.*', '.*echo!.*', '.*fast\-webcrawler.*', '.*ia_archiver\-web\.archive\.org.*', '.*ia_archiver.*', '.*jennybot.*', '.*mercator.*', '.*netcraft.*', '.*msnbot\-media.*', '.*msnbot.*', '.*petersnews.*', '.*relevantnoise\.com.*', '.*unlost_web_crawler.*', '.*voila.*', '.*webbase.*', '.*webcollage.*', '.*cfetch.*', '.*zyborg.*', '.*wisenutbot.*', '.*[^a]fish.*', '.*abcdatos.*', '.*acme\.spider.*', '.*ahoythehomepagefinder.*', '.*alkaline.*', '.*anthill.*', '.*arachnophilia.*', '.*arale.*', '.*araneo.*', '.*aretha.*', '.*ariadne.*', '.*powermarks.*', '.*arks.*', '.*aspider.*', '.*atn\.txt.*', '.*atomz.*', '.*auresys.*', '.*backrub.*', '.*bbot.*', '.*bigbrother.*', '.*blackwidow.*', '.*blindekuh.*', '.*bloodhound.*', '.*borg\-bot.*', '.*brightnet.*', '.*bspider.*', '.*cactvschemistryspider.*', '.*calif[^r].*', '.*cassandra.*', '.*cgireader.*', '.*checkbot.*', '.*christcrawler.*', '.*churl.*', '.*cienciaficcion.*', '.*collective.*', '.*combine.*', '.*conceptbot.*', '.*coolbot.*', '.*core.*', '.*cosmos.*', '.*cruiser.*', '.*cusco.*', '.*cyberspyder.*', '.*desertrealm.*', '.*deweb.*', '.*dienstspider.*', '.*digger.*', '.*diibot.*', '.*direct_hit.*', '.*dnabot.*', '.*download_express.*', '.*dragonbot.*', '.*dwcp.*', '.*e\-collector.*', '.*ebiness.*', '.*elfinbot.*', '.*emacs.*', '.*emcspider.*', '.*esther.*', '.*evliyacelebi.*', '.*fastcrawler.*', '.*feedcrawl.*', '.*fdse.*', '.*felix.*', '.*fetchrover.*', '.*fido.*', '.*finnish.*', '.*fireball.*', '.*fouineur.*', '.*francoroute.*', '.*freecrawl.*', '.*funnelweb.*', '.*gama.*', '.*gazz.*', '.*gcreep.*', '.*getbot.*', '.*geturl.*', '.*golem.*', '.*gougou.*', '.*grapnel.*', '.*griffon.*', '.*gromit.*', '.*gulperbot.*', '.*hambot.*', '.*havindex.*', '.*hometown.*', '.*htmlgobble.*', '.*hyperdecontextualizer.*', '.*iajabot.*', '.*iaskspider.*', '.*hl_ftien_spider.*', '.*sogou.*', '.*iconoclast.*', '.*ilse.*', '.*imagelock.*', '.*incywincy.*', '.*informant.*', '.*infoseek.*', '.*infoseeksidewinder.*', '.*infospider.*', '.*inspectorwww.*', '.*intelliagent.*', '.*irobot.*', '.*iron33.*', '.*israelisearch.*', '.*javabee.*', '.*jbot.*', '.*jcrawler.*', '.*jobo.*', '.*jobot.*', '.*joebot.*', '.*jubii.*', '.*jumpstation.*', '.*kapsi.*', '.*katipo.*', '.*kilroy.*', '.*ko[_+ ]yappo[_+ ]robot.*', '.*kummhttp.*', '.*labelgrabber\.txt.*', '.*larbin.*', '.*legs.*', '.*linkidator.*', '.*linkscan.*', '.*lockon.*', '.*logo_gif.*', '.*macworm.*', '.*magpie.*', '.*marvin.*', '.*mattie.*', '.*mediafox.*', '.*merzscope.*', '.*meshexplorer.*', '.*mindcrawler.*', '.*mnogosearch.*', '.*momspider.*', '.*monster.*', '.*motor.*', '.*muncher.*', '.*mwdsearch.*', '.*ndspider.*', '.*nederland\.zoek.*', '.*netcarta.*', '.*netmechanic.*', '.*netscoop.*', '.*newscan\-online.*', '.*nhse.*', '.*northstar.*', '.*nzexplorer.*', '.*objectssearch.*', '.*occam.*', '.*octopus.*', '.*openfind.*', '.*orb_search.*', '.*packrat.*', '.*pageboy.*', '.*parasite.*', '.*patric.*', '.*pegasus.*', '.*perignator.*', '.*perlcrawler.*', '.*phantom.*', '.*phpdig.*', '.*piltdownman.*', '.*pimptrain.*', '.*pioneer.*', '.*pitkow.*', '.*pjspider.*', '.*plumtreewebaccessor.*', '.*poppi.*', '.*portalb.*', '.*psbot.*', '.*python.*', '.*raven.*', '.*rbse.*', '.*resumerobot.*', '.*rhcs.*', '.*road_runner.*', '.*robbie.*', '.*robi.*', '.*robocrawl.*', '.*robofox.*', '.*robozilla.*', '.*roverbot.*', '.*rules.*', '.*safetynetrobot.*', '.*search\-info.*', '.*search_au.*', '.*searchprocess.*', '.*senrigan.*', '.*sgscout.*', '.*shaggy.*', '.*shaihulud.*', '.*sift.*', '.*simbot.*', '.*site\-valet.*', '.*sitetech.*', '.*skymob.*', '.*slcrawler.*', '.*smartspider.*', '.*snooper.*', '.*solbot.*', '.*speedy.*', '.*spider[_+ ]monkey.*', '.*spiderbot.*', '.*spiderline.*', '.*spiderman.*', '.*spiderview.*', '.*spry.*', '.*sqworm.*', '.*ssearcher.*', '.*suke.*', '.*sunrise.*', '.*suntek.*', '.*sven.*', '.*tach_bw.*', '.*tagyu_agent.*', '.*tailrank.*', '.*tarantula.*', '.*tarspider.*', '.*techbot.*', '.*templeton.*', '.*titan.*', '.*titin.*', '.*tkwww.*', '.*tlspider.*', '.*ucsd.*', '.*udmsearch.*', '.*universalfeedparser.*', '.*urlck.*', '.*valkyrie.*', '.*verticrawl.*', '.*victoria.*', '.*visionsearch.*', '.*voidbot.*', '.*vwbot.*', '.*w3index.*', '.*w3m2.*', '.*wallpaper.*', '.*wanderer.*', '.*wapspIRLider.*', '.*webbandit.*', '.*webcatcher.*', '.*webcopy.*', '.*webfetcher.*', '.*webfoot.*', '.*webinator.*', '.*weblinker.*', '.*webmirror.*', '.*webmoose.*', '.*webquest.*', '.*webreader.*', '.*webreaper.*', '.*websnarf.*', '.*webspider.*', '.*webvac.*', '.*webwalk.*', '.*webwalker.*', '.*webwatch.*', '.*whatuseek.*', '.*whowhere.*', '.*wired\-digital.*', '.*wmir.*', '.*wolp.*', '.*wombat.*', '.*wordpress.*', '.*worm.*', '.*woozweb.*', '.*wwwc.*', '.*wz101.*', '.*xget.*', '.*1\-more_scanner.*', '.*accoona\-ai\-agent.*', '.*activebookmark.*', '.*adamm_bot.*', '.*almaden.*', '.*aipbot.*', '.*aleadsoftbot.*', '.*alpha_search_agent.*', '.*allrati.*', '.*aport.*', '.*archive\.org_bot.*', '.*argus.*', '.*arianna\.libero\.it.*', '.*aspseek.*', '.*asterias.*', '.*awbot.*', '.*baiduspider.*', '.*becomebot.*', '.*bender.*', '.*betabot.*', '.*biglotron.*', '.*bittorrent_bot.*', '.*biz360[_+ ]spider.*', '.*blogbridge[_+ ]service.*', '.*bloglines.*', '.*blogpulse.*', '.*blogsearch.*', '.*blogshares.*', '.*blogslive.*', '.*blogssay.*', '.*bncf\.firenze\.sbn\.it\/raccolta\.txt.*', '.*bobby.*', '.*boitho\.com\-dc.*', '.*bookmark\-manager.*', '.*boris.*', '.*bumblebee.*', '.*candlelight[_+ ]favorites[_+ ]inspector.*', '.*cbn00glebot.*', '.*cerberian_drtrs.*', '.*cfnetwork.*', '.*cipinetbot.*', '.*checkweb_link_validator.*', '.*commons\-httpclient.*', '.*computer_and_automation_research_institute_crawler.*', '.*converamultimediacrawler.*', '.*converacrawler.*', '.*cscrawler.*', '.*cse_html_validator_lite_online.*', '.*cuasarbot.*', '.*cursor.*', '.*custo.*', '.*datafountains\/dmoz_downloader.*', '.*daviesbot.*', '.*daypopbot.*', '.*deepindex.*', '.*dipsie\.bot.*', '.*dnsgroup.*', '.*domainchecker.*', '.*domainsdb\.net.*', '.*dulance.*', '.*dumbot.*', '.*dumm\.de\-bot.*', '.*earthcom\.info.*', '.*easydl.*', '.*edgeio\-retriever.*', '.*ets_v.*', '.*exactseek.*', '.*extreme[_+ ]picture[_+ ]finder.*', '.*eventax.*', '.*everbeecrawler.*', '.*everest\-vulcan.*', '.*ezresult.*', '.*enteprise.*', '.*facebook.*', '.*fast_enterprise_crawler.*crawleradmin\.t\-info@telekom\.de.*', '.*fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de.*', '.*matrix_s\.p\.a\._\-_fast_enterprise_crawler.*', '.*fast_enterprise_crawler.*', '.*fast\-search\-engine.*', '.*favicon.*', '.*favorg.*', '.*favorites_sweeper.*', '.*feedburner.*', '.*feedfetcher\-google.*', '.*feedflow.*', '.*feedster.*', '.*feedsky.*', '.*feedvalidator.*', '.*filmkamerabot.*', '.*findlinks.*', '.*findexa_crawler.*', '.*fooky\.com\/ScorpionBot.*', '.*g2crawler.*', '.*gaisbot.*', '.*geniebot.*', '.*gigabot.*', '.*girafabot.*', '.*global_fetch.*', '.*gnodspider.*', '.*goforit\.com.*', '.*goforitbot.*', '.*gonzo.*', '.*grub.*', '.*gpu_p2p_crawler.*', '.*henrythemiragorobot.*', '.*heritrix.*', '.*holmes.*', '.*hoowwwer.*', '.*hpprint.*', '.*htmlparser.*', '.*html[_+ ]link[_+ ]validator.*', '.*httrack.*', '.*hundesuche\.com\-bot.*', '.*ichiro.*', '.*iltrovatore\-setaccio.*', '.*infobot.*', '.*infociousbot.*', '.*infomine.*', '.*insurancobot.*', '.*internet[_+ ]ninja.*', '.*internetarchive.*', '.*internetseer.*', '.*internetsupervision.*', '.*irlbot.*', '.*isearch2006.*', '.*iupui_research_bot.*', '.*jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility.*', '.*justview.*', '.*kalambot.*', '.*kamano\.de_newsfeedverzeichnis.*', '.*kazoombot.*', '.*kevin.*', '.*keyoshid.*', '.*kinjabot.*', '.*kinja\-imagebot.*', '.*knowitall.*', '.*knowledge\.com.*', '.*kouaa_krawler.*', '.*krugle.*', '.*ksibot.*', '.*kurzor.*', '.*lanshanbot.*', '.*letscrawl\.com.*', '.*libcrawl.*', '.*linkbot.*', '.*link_valet_online.*', '.*metager\-linkchecker.*', '.*linkchecker.*', '.*livejournal\.com.*', '.*lmspider.*', '.*lwp\-request.*', '.*lwp\-trivial.*', '.*magpierss.*', '.*mail\.ru.*', '.*mapoftheinternet\.com.*', '.*mediapartners\-google.*', '.*megite.*', '.*metaspinner.*', '.*microsoft[_+ ]url[_+ ]control.*', '.*mini\-reptile.*', '.*minirank.*', '.*missigua_locator.*', '.*misterbot.*', '.*miva.*', '.*mizzu_labs.*', '.*mj12bot.*', '.*mojeekbot.*', '.*msiecrawler.*', '.*ms_search_4\.0_robot.*', '.*msrabot.*', '.*msrbot.*', '.*mt::telegraph::agent.*', '.*nagios.*', '.*nasa_search.*', '.*mydoyouhike.*', '.*netluchs.*', '.*netsprint.*', '.*newsgatoronline.*', '.*nicebot.*', '.*nimblecrawler.*', '.*noxtrumbot.*', '.*npbot.*', '.*nutchcvs.*', '.*nutchosu\-vlib.*', '.*nutch.*', '.*ocelli.*', '.*octora_beta_bot.*', '.*omniexplorer[_+ ]bot.*', '.*onet\.pl[_+ ]sa.*', '.*onfolio.*', '.*opentaggerbot.*', '.*openwebspider.*', '.*oracle_ultra_search.*', '.*orbiter.*', '.*yodaobot.*', '.*qihoobot.*', '.*passwordmaker\.org.*', '.*pear_http_request_class.*', '.*peerbot.*', '.*perman.*', '.*php[_+ ]version[_+ ]tracker.*', '.*pictureofinternet.*', '.*ping\.blo\.gs.*', '.*plinki.*', '.*pluckfeedcrawler.*', '.*pogodak.*', '.*pompos.*', '.*popdexter.*', '.*port_huron_labs.*', '.*postfavorites.*', '.*projectwf\-java\-test\-crawler.*', '.*proodlebot.*', '.*pyquery.*', '.*rambler.*', '.*redalert.*', '.*rojo.*', '.*rssimagesbot.*', '.*ruffle.*', '.*rufusbot.*', '.*sandcrawler.*', '.*sbider.*', '.*schizozilla.*', '.*scumbot.*', '.*searchguild[_+ ]dmoz[_+ ]experiment.*', '.*seekbot.*', '.*sensis_web_crawler.*', '.*seznambot.*', '.*shim\-crawler.*', '.*shoutcast.*', '.*slysearch.*', '.*snap\.com_beta_crawler.*', '.*sohu\-search.*', '.*sohu.*', '.*snappy.*', '.*sphere_scout.*', '.*spip.*', '.*sproose_crawler.*', '.*steeler.*', '.*steroid__download.*', '.*suchfin\-bot.*', '.*superbot.*', '.*surveybot.*', '.*susie.*', '.*syndic8.*', '.*syndicapi.*', '.*synoobot.*', '.*tcl_http_client_package.*', '.*technoratibot.*', '.*teragramcrawlersurf.*', '.*test_crawler.*', '.*testbot.*', '.*t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e.*', '.*topicblogs.*', '.*turnitinbot.*', '.*turtlescanner.*', '.*turtle.*', '.*tutorgigbot.*', '.*twiceler.*', '.*ubicrawler.*', '.*ultraseek.*', '.*unchaos_bot_hybrid_web_search_engine.*', '.*unido\-bot.*', '.*updated.*', '.*ustc\-semantic\-group.*', '.*vagabondo\-wap.*', '.*vagabondo.*', '.*vermut.*', '.*versus_crawler_from_eda\.baykan@epfl\.ch.*', '.*vespa_crawler.*', '.*vortex.*', '.*vse\/.*', '.*w3c\-checklink.*', '.*w3c[_+ ]css[_+ ]validator[_+ ]jfouffa.*', '.*w3c_validator.*', '.*watchmouse.*', '.*wavefire.*', '.*webclipping\.com.*', '.*webcompass.*', '.*webcrawl\.net.*', '.*web_downloader.*', '.*webdup.*', '.*webfilter.*', '.*webindexer.*', '.*webminer.*', '.*website[_+ ]monitoring[_+ ]bot.*', '.*webvulncrawl.*', '.*wells_search.*', '.*wonderer.*', '.*wume_crawler.*', '.*wwweasel.*', '.*xenu\'s_link_sleuth.*', '.*xenu_link_sleuth.*', '.*xirq.*', '.*y!j.*', '.*yacy.*', '.*yahoo\-blogs.*', '.*yahoo\-verticalcrawler.*', '.*yahoofeedseeker.*', '.*yahooseeker\-testing.*', '.*yahooseeker.*', '.*yahoo\-mmcrawler.*', '.*yahoo!_mindset.*', '.*yandex.*', '.*flexum.*', '.*yanga.*', '.*yooglifetchagent.*', '.*z\-add_link_checker.*', '.*zealbot.*', '.*zhuaxia.*', '.*zspider.*', '.*zeus.*', '.*ng\/1\..*', '.*ng\/2\..*', '.*exabot.*', '.*wget.*', '.*libwww.*', '.*java\/[0-9].*'] + +search_engines = ['.*google\.[\w.]+/products.*', '.*base\.google\..*', '.*froogle\.google\..*', '.*groups\.google\..*', '.*images\.google\..*', '.*google\..*', '.*googlee\..*', '.*googlecom\.com.*', '.*goggle\.co\.hu.*', '.*216\.239\.(35|37|39|51)\.100.*', '.*216\.239\.(35|37|39|51)\.101.*', '.*216\.239\.5[0-9]\.104.*', '.*64\.233\.1[0-9]{2}\.104.*', '.*66\.102\.[1-9]\.104.*', '.*66\.249\.93\.104.*', '.*72\.14\.2[0-9]{2}\.104.*', '.*msn\..*', '.*live\.com.*', '.*bing\..*', '.*voila\..*', '.*mindset\.research\.yahoo.*', '.*yahoo\..*', '.*(66\.218\.71\.225|216\.109\.117\.135|216\.109\.125\.130|66\.218\.69\.11).*', '.*search\.aol\.co.*', '.*tiscali\..*', '.*lycos\..*', '.*alexa\.com.*', '.*alltheweb\.com.*', '.*altavista\..*', '.*a9\.com.*', '.*dmoz\.org.*', '.*netscape\..*', '.*search\.terra\..*', '.*www\.search\.com.*', '.*search\.sli\.sympatico\.ca.*', '.*excite\..*'] + +search_engines_2 = ['.*4\-counter\.com.*', '.*att\.net.*', '.*bungeebonesdotcom.*', '.*northernlight\..*', '.*hotbot\..*', '.*kvasir\..*', '.*webcrawler\..*', '.*metacrawler\..*', '.*go2net\.com.*', '.*(^|\.)go\.com.*', '.*euroseek\..*', '.*looksmart\..*', '.*spray\..*', '.*nbci\.com\/search.*', '.*de\.ask.\com.*', '.*es\.ask.\com.*', '.*fr\.ask.\com.*', '.*it\.ask.\com.*', '.*nl\.ask.\com.*', '.*uk\.ask.\com.*', '.*(^|\.)ask\.com.*', '.*atomz\..*', '.*overture\.com.*', '.*teoma\..*', '.*findarticles\.com.*', '.*infospace\.com.*', '.*mamma\..*', '.*dejanews\..*', '.*dogpile\.com.*', '.*wisenut\.com.*', '.*ixquick\.com.*', '.*search\.earthlink\.net.*', '.*i-une\.com.*', '.*blingo\.com.*', '.*centraldatabase\.org.*', '.*clusty\.com.*', '.*mysearch\..*', '.*vivisimo\.com.*', '.*kartoo\.com.*', '.*icerocket\.com.*', '.*sphere\.com.*', '.*ledix\.net.*', '.*start\.shaw\.ca.*', '.*searchalot\.com.*', '.*copernic\.com.*', '.*avantfind\.com.*', '.*steadysearch\.com.*', '.*steady-search\.com.*', '.*chello\.at.*', '.*chello\.be.*', '.*chello\.cz.*', '.*chello\.fr.*', '.*chello\.hu.*', '.*chello\.nl.*', '.*chello\.no.*', '.*chello\.pl.*', '.*chello\.se.*', '.*chello\.sk.*', '.*chello.*', '.*mirago\.be.*', '.*mirago\.ch.*', '.*mirago\.de.*', '.*mirago\.dk.*', '.*es\.mirago\.com.*', '.*mirago\.fr.*', '.*mirago\.it.*', '.*mirago\.nl.*', '.*no\.mirago\.com.*', '.*mirago\.se.*', '.*mirago\.co\.uk.*', '.*mirago.*', '.*answerbus\.com.*', '.*icq\.com\/search.*', '.*nusearch\.com.*', '.*goodsearch\.com.*', '.*scroogle\.org.*', '.*questionanswering\.com.*', '.*mywebsearch\.com.*', '.*as\.starware\.com.*', '.*del\.icio\.us.*', '.*digg\.com.*', '.*stumbleupon\.com.*', '.*swik\.net.*', '.*segnalo\.alice\.it.*', '.*ineffabile\.it.*', '.*anzwers\.com\.au.*', '.*engine\.exe.*', '.*miner\.bol\.com\.br.*', '.*\.baidu\.com.*', '.*\.vnet\.cn.*', '.*\.soso\.com.*', '.*\.sogou\.com.*', '.*\.3721\.com.*', '.*iask\.com.*', '.*\.accoona\.com.*', '.*\.163\.com.*', '.*\.zhongsou\.com.*', '.*atlas\.cz.*', '.*seznam\.cz.*', '.*quick\.cz.*', '.*centrum\.cz.*', '.*jyxo\.(cz|com).*', '.*najdi\.to.*', '.*redbox\.cz.*', '.*opasia\.dk.*', '.*danielsen\.com.*', '.*sol\.dk.*', '.*jubii\.dk.*', '.*find\.dk.*', '.*edderkoppen\.dk.*', '.*netstjernen\.dk.*', '.*orbis\.dk.*', '.*tyfon\.dk.*', '.*1klik\.dk.*', '.*ofir\.dk.*', '.*ilse\..*', '.*vindex\..*', '.*(^|\.)ask\.co\.uk.*', '.*bbc\.co\.uk/cgi-bin/search.*', '.*ifind\.freeserve.*', '.*looksmart\.co\.uk.*', '.*splut\..*', '.*spotjockey\..*', '.*ukdirectory\..*', '.*ukindex\.co\.uk.*', '.*ukplus\..*', '.*searchy\.co\.uk.*', '.*haku\.www\.fi.*', '.*recherche\.aol\.fr.*', '.*ctrouve\..*', '.*francite\..*', '.*\.lbb\.org.*', '.*rechercher\.libertysurf\.fr.*', '.*search[\w\-]+\.free\.fr.*', '.*recherche\.club-internet\.fr.*', '.*toile\.com.*', '.*biglotron\.com.*', '.*mozbot\.fr.*', '.*sucheaol\.aol\.de.*', '.*fireball\.de.*', '.*infoseek\.de.*', '.*suche\d?\.web\.de.*', '.*[a-z]serv\.rrzn\.uni-hannover\.de.*', '.*suchen\.abacho\.de.*', '.*(brisbane|suche)\.t-online\.de.*', '.*allesklar\.de.*', '.*meinestadt\.de.*', '.*212\.227\.33\.241.*', '.*(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42).*', '.*wwweasel\.de.*', '.*netluchs\.de.*', '.*schoenerbrausen\.de.*', '.*heureka\.hu.*', '.*vizsla\.origo\.hu.*', '.*lapkereso\.hu.*', '.*goliat\.hu.*', '.*index\.hu.*', '.*wahoo\.hu.*', '.*webmania\.hu.*', '.*search\.internetto\.hu.*', '.*tango\.hu.*', '.*keresolap\.hu.*', '.*polymeta\.hu.*', '.*sify\.com.*', '.*virgilio\.it.*', '.*arianna\.libero\.it.*', '.*supereva\.com.*', '.*kataweb\.it.*', '.*search\.alice\.it\.master.*', '.*search\.alice\.it.*', '.*gotuneed\.com.*', '.*godado.*', '.*jumpy\.it.*', '.*shinyseek\.it.*', '.*teecno\.it.*', '.*ask\.jp.*', '.*sagool\.jp.*', '.*sok\.start\.no.*', '.*eniro\.no.*', '.*szukaj\.wp\.pl.*', '.*szukaj\.onet\.pl.*', '.*dodaj\.pl.*', '.*gazeta\.pl.*', '.*gery\.pl.*', '.*hoga\.pl.*', '.*netsprint\.pl.*', '.*interia\.pl.*', '.*katalog\.onet\.pl.*', '.*o2\.pl.*', '.*polska\.pl.*', '.*szukacz\.pl.*', '.*wow\.pl.*', '.*ya(ndex)?\.ru.*', '.*aport\.ru.*', '.*rambler\.ru.*', '.*turtle\.ru.*', '.*metabot\.ru.*', '.*evreka\.passagen\.se.*', '.*eniro\.se.*', '.*zoznam\.sk.*', '.*sapo\.pt.*', '.*search\.ch.*', '.*search\.bluewin\.ch.*', '.*pogodak\..*'] + +not_search_engines_keys = {'.*yahoo\..*' : '(?:picks|mail)\.yahoo\.|yahoo\.[^/]+/picks', '.*altavista\..*' : 'babelfish\.altavista\.', '.*tiscali\..*' : 'mail\.tiscali\.', '.*yandex\..*' : 'direct\.yandex\.', '.*google\..*' : 'translate\.google\.', '.*msn\..*' : 'hotmail\.msn\.'} + +search_engines_hashid = {'.*search\.sli\.sympatico\.ca.*' : 'sympatico', '.*mywebsearch\.com.*' : 'mywebsearch', '.*netsprint\.pl\/hoga\-search.*' : 'hogapl', '.*findarticles\.com.*' : 'findarticles', '.*wow\.pl.*' : 'wowpl', '.*allesklar\.de.*' : 'allesklar', '.*atomz\..*' : 'atomz', '.*bing\..*' : 'bing', '.*find\.dk.*' : 'finddk', '.*google\..*' : 'google', '.*(66\.218\.71\.225|216\.109\.117\.135|216\.109\.125\.130|66\.218\.69\.11).*' : 'yahoo', '.*pogodak\..*' : 'pogodak', '.*ask\.jp.*' : 'askjp', '.*\.baidu\.com.*' : 'baidu', '.*tango\.hu.*' : 'tango_hu', '.*gotuneed\.com.*' : 'gotuneed', '.*quick\.cz.*' : 'quick', '.*mirago.*' : 'mirago', '.*szukaj\.wp\.pl.*' : 'wp', '.*mirago\.de.*' : 'miragode', '.*mirago\.dk.*' : 'miragodk', '.*katalog\.onet\.pl.*' : 'katalogonetpl', '.*googlee\..*' : 'google', '.*orbis\.dk.*' : 'orbis', '.*turtle\.ru.*' : 'turtle', '.*zoznam\.sk.*' : 'zoznam', '.*start\.shaw\.ca.*' : 'shawca', '.*chello\.at.*' : 'chelloat', '.*centraldatabase\.org.*' : 'centraldatabase', '.*centrum\.cz.*' : 'centrum', '.*kataweb\.it.*' : 'kataweb', '.*\.lbb\.org.*' : 'lbb', '.*blingo\.com.*' : 'blingo', '.*vivisimo\.com.*' : 'vivisimo', '.*stumbleupon\.com.*' : 'stumbleupon', '.*es\.ask.\com.*' : 'askes', '.*interia\.pl.*' : 'interiapl', '.*[a-z]serv\.rrzn\.uni-hannover\.de.*' : 'meta', '.*search\.alice\.it.*' : 'aliceit', '.*shinyseek\.it.*' : 'shinyseek\.it', '.*i-une\.com.*' : 'iune', '.*dejanews\..*' : 'dejanews', '.*opasia\.dk.*' : 'opasia', '.*chello\.cz.*' : 'chellocz', '.*ya(ndex)?\.ru.*' : 'yandex', '.*kartoo\.com.*' : 'kartoo', '.*arianna\.libero\.it.*' : 'arianna', '.*ofir\.dk.*' : 'ofir', '.*search\.earthlink\.net.*' : 'earthlink', '.*biglotron\.com.*' : 'biglotron', '.*lapkereso\.hu.*' : 'lapkereso', '.*216\.239\.(35|37|39|51)\.101.*' : 'google_cache', '.*miner\.bol\.com\.br.*' : 'miner', '.*dodaj\.pl.*' : 'dodajpl', '.*mirago\.be.*' : 'miragobe', '.*googlecom\.com.*' : 'google', '.*steadysearch\.com.*' : 'steadysearch', '.*redbox\.cz.*' : 'redbox', '.*haku\.www\.fi.*' : 'haku', '.*sapo\.pt.*' : 'sapo', '.*sphere\.com.*' : 'sphere', '.*danielsen\.com.*' : 'danielsen', '.*alexa\.com.*' : 'alexa', '.*mamma\..*' : 'mamma', '.*swik\.net.*' : 'swik', '.*polska\.pl.*' : 'polskapl', '.*groups\.google\..*' : 'google_groups', '.*metabot\.ru.*' : 'metabot', '.*rechercher\.libertysurf\.fr.*' : 'libertysurf', '.*szukaj\.onet\.pl.*' : 'onetpl', '.*aport\.ru.*' : 'aport', '.*de\.ask.\com.*' : 'askde', '.*splut\..*' : 'splut', '.*live\.com.*' : 'live', '.*216\.239\.5[0-9]\.104.*' : 'google_cache', '.*mysearch\..*' : 'mysearch', '.*ukplus\..*' : 'ukplus', '.*najdi\.to.*' : 'najdi', '.*overture\.com.*' : 'overture', '.*iask\.com.*' : 'iask', '.*nl\.ask.\com.*' : 'asknl', '.*nbci\.com\/search.*' : 'nbci', '.*search\.aol\.co.*' : 'aol', '.*eniro\.se.*' : 'enirose', '.*64\.233\.1[0-9]{2}\.104.*' : 'google_cache', '.*mirago\.ch.*' : 'miragoch', '.*altavista\..*' : 'altavista', '.*chello\.hu.*' : 'chellohu', '.*mozbot\.fr.*' : 'mozbot', '.*northernlight\..*' : 'northernlight', '.*mirago\.co\.uk.*' : 'miragocouk', '.*search[\w\-]+\.free\.fr.*' : 'free', '.*mindset\.research\.yahoo.*' : 'yahoo_mindset', '.*copernic\.com.*' : 'copernic', '.*heureka\.hu.*' : 'heureka', '.*steady-search\.com.*' : 'steadysearch', '.*teecno\.it.*' : 'teecnoit', '.*voila\..*' : 'voila', '.*netstjernen\.dk.*' : 'netstjernen', '.*keresolap\.hu.*' : 'keresolap_hu', '.*yahoo\..*' : 'yahoo', '.*icerocket\.com.*' : 'icerocket', '.*alltheweb\.com.*' : 'alltheweb', '.*www\.search\.com.*' : 'search.com', '.*digg\.com.*' : 'digg', '.*tiscali\..*' : 'tiscali', '.*spotjockey\..*' : 'spotjockey', '.*a9\.com.*' : 'a9', '.*(brisbane|suche)\.t-online\.de.*' : 't-online', '.*ifind\.freeserve.*' : 'freeserve', '.*att\.net.*' : 'att', '.*mirago\.it.*' : 'miragoit', '.*index\.hu.*' : 'indexhu', '.*\.sogou\.com.*' : 'sogou', '.*no\.mirago\.com.*' : 'miragono', '.*ineffabile\.it.*' : 'ineffabile', '.*netluchs\.de.*' : 'netluchs', '.*toile\.com.*' : 'toile', '.*search\..*\.\w+.*' : 'search', '.*del\.icio\.us.*' : 'delicious', '.*vizsla\.origo\.hu.*' : 'origo', '.*netscape\..*' : 'netscape', '.*dogpile\.com.*' : 'dogpile', '.*anzwers\.com\.au.*' : 'anzwers', '.*\.zhongsou\.com.*' : 'zhongsou', '.*ctrouve\..*' : 'ctrouve', '.*gazeta\.pl.*' : 'gazetapl', '.*recherche\.club-internet\.fr.*' : 'clubinternet', '.*sok\.start\.no.*' : 'start', '.*scroogle\.org.*' : 'scroogle', '.*schoenerbrausen\.de.*' : 'schoenerbrausen', '.*looksmart\.co\.uk.*' : 'looksmartuk', '.*wwweasel\.de.*' : 'wwweasel', '.*godado.*' : 'godado', '.*216\.239\.(35|37|39|51)\.100.*' : 'google_cache', '.*jubii\.dk.*' : 'jubii', '.*212\.227\.33\.241.*' : 'metaspinner', '.*mirago\.fr.*' : 'miragofr', '.*sol\.dk.*' : 'sol', '.*bbc\.co\.uk/cgi-bin/search.*' : 'bbc', '.*jumpy\.it.*' : 'jumpy\.it', '.*francite\..*' : 'francite', '.*infoseek\.de.*' : 'infoseek', '.*es\.mirago\.com.*' : 'miragoes', '.*jyxo\.(cz|com).*' : 'jyxo', '.*hotbot\..*' : 'hotbot', '.*engine\.exe.*' : 'engine', '.*(^|\.)ask\.com.*' : 'ask', '.*goliat\.hu.*' : 'goliat', '.*wisenut\.com.*' : 'wisenut', '.*mirago\.nl.*' : 'miragonl', '.*base\.google\..*' : 'google_base', '.*search\.bluewin\.ch.*' : 'bluewin', '.*lycos\..*' : 'lycos', '.*meinestadt\.de.*' : 'meinestadt', '.*4\-counter\.com.*' : 'google4counter', '.*search\.alice\.it\.master.*' : 'aliceitmaster', '.*teoma\..*' : 'teoma', '.*(^|\.)ask\.co\.uk.*' : 'askuk', '.*tyfon\.dk.*' : 'tyfon', '.*froogle\.google\..*' : 'google_froogle', '.*ukdirectory\..*' : 'ukdirectory', '.*ledix\.net.*' : 'ledix', '.*edderkoppen\.dk.*' : 'edderkoppen', '.*recherche\.aol\.fr.*' : 'aolfr', '.*google\.[\w.]+/products.*' : 'google_products', '.*webmania\.hu.*' : 'webmania', '.*searchy\.co\.uk.*' : 'searchy', '.*fr\.ask.\com.*' : 'askfr', '.*spray\..*' : 'spray', '.*72\.14\.2[0-9]{2}\.104.*' : 'google_cache', '.*eniro\.no.*' : 'eniro', '.*goodsearch\.com.*' : 'goodsearch', '.*kvasir\..*' : 'kvasir', '.*\.accoona\.com.*' : 'accoona', '.*\.soso\.com.*' : 'soso', '.*as\.starware\.com.*' : 'comettoolbar', '.*virgilio\.it.*' : 'virgilio', '.*o2\.pl.*' : 'o2pl', '.*chello\.nl.*' : 'chellonl', '.*chello\.be.*' : 'chellobe', '.*icq\.com\/search.*' : 'icq', '.*msn\..*' : 'msn', '.*fireball\.de.*' : 'fireball', '.*sucheaol\.aol\.de.*' : 'aolde', '.*uk\.ask.\com.*' : 'askuk', '.*euroseek\..*' : 'euroseek', '.*gery\.pl.*' : 'gerypl', '.*chello\.fr.*' : 'chellofr', '.*netsprint\.pl.*' : 'netsprintpl', '.*avantfind\.com.*' : 'avantfind', '.*supereva\.com.*' : 'supereva', '.*polymeta\.hu.*' : 'polymeta_hu', '.*infospace\.com.*' : 'infospace', '.*sify\.com.*' : 'sify', '.*go2net\.com.*' : 'go2net', '.*wahoo\.hu.*' : 'wahoo', '.*suche\d?\.web\.de.*' : 'webde', '.*(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42).*' : 'metacrawler_de', '.*\.3721\.com.*' : '3721', '.*ilse\..*' : 'ilse', '.*metacrawler\..*' : 'metacrawler', '.*sagool\.jp.*' : 'sagool', '.*atlas\.cz.*' : 'atlas', '.*vindex\..*' : 'vindex', '.*ixquick\.com.*' : 'ixquick', '.*66\.102\.[1-9]\.104.*' : 'google_cache', '.*rambler\.ru.*' : 'rambler', '.*answerbus\.com.*' : 'answerbus', '.*evreka\.passagen\.se.*' : 'passagen', '.*chello\.se.*' : 'chellose', '.*clusty\.com.*' : 'clusty', '.*search\.ch.*' : 'searchch', '.*chello\.no.*' : 'chellono', '.*searchalot\.com.*' : 'searchalot', '.*questionanswering\.com.*' : 'questionanswering', '.*seznam\.cz.*' : 'seznam', '.*ukindex\.co\.uk.*' : 'ukindex', '.*dmoz\.org.*' : 'dmoz', '.*excite\..*' : 'excite', '.*chello\.pl.*' : 'chellopl', '.*looksmart\..*' : 'looksmart', '.*1klik\.dk.*' : '1klik', '.*\.vnet\.cn.*' : 'vnet', '.*chello\.sk.*' : 'chellosk', '.*(^|\.)go\.com.*' : 'go', '.*nusearch\.com.*' : 'nusearch', '.*it\.ask.\com.*' : 'askit', '.*bungeebonesdotcom.*' : 'bungeebonesdotcom', '.*search\.terra\..*' : 'terra', '.*webcrawler\..*' : 'webcrawler', '.*suchen\.abacho\.de.*' : 'abacho', '.*szukacz\.pl.*' : 'szukaczpl', '.*66\.249\.93\.104.*' : 'google_cache', '.*search\.internetto\.hu.*' : 'internetto', '.*goggle\.co\.hu.*' : 'google', '.*mirago\.se.*' : 'miragose', '.*images\.google\..*' : 'google_image', '.*segnalo\.alice\.it.*' : 'segnalo', '.*\.163\.com.*' : 'netease', '.*chello.*' : 'chellocom'} + +search_engines_knwown_url = {'.*dmoz.*' : 'search=', '.*google.*' : '(p|q|as_p|as_q)=', '.*searchalot.*' : 'q=', '.*teoma.*' : 'q=', '.*looksmartuk.*' : 'key=', '.*polymeta_hu.*' : '', '.*google_groups.*' : 'group\/', '.*iune.*' : '(keywords|q)=', '.*chellosk.*' : 'q1=', '.*eniro.*' : 'q=', '.*msn.*' : 'q=', '.*webcrawler.*' : 'searchText=', '.*mirago.*' : '(txtsearch|qry)=', '.*enirose.*' : 'q=', '.*miragobe.*' : '(txtsearch|qry)=', '.*netease.*' : 'q=', '.*netluchs.*' : 'query=', '.*google_products.*' : '(p|q|as_p|as_q)=', '.*jyxo.*' : '(s|q)=', '.*origo.*' : '(q|search)=', '.*ilse.*' : 'search_for=', '.*chellocom.*' : 'q1=', '.*goodsearch.*' : 'Keywords=', '.*ledix.*' : 'q=', '.*mozbot.*' : 'q=', '.*chellocz.*' : 'q1=', '.*webde.*' : 'su=', '.*biglotron.*' : 'question=', '.*metacrawler_de.*' : 'qry=', '.*finddk.*' : 'words=', '.*start.*' : 'q=', '.*sagool.*' : 'q=', '.*miragoch.*' : '(txtsearch|qry)=', '.*google_base.*' : '(p|q|as_p|as_q)=', '.*aliceit.*' : 'qs=', '.*shinyseek\.it.*' : 'KEY=', '.*onetpl.*' : 'qt=', '.*clusty.*' : 'query=', '.*chellonl.*' : 'q1=', '.*miragode.*' : '(txtsearch|qry)=', '.*miragose.*' : '(txtsearch|qry)=', '.*o2pl.*' : 'qt=', '.*goliat.*' : 'KERESES=', '.*kvasir.*' : 'q=', '.*askfr.*' : '(ask|q)=', '.*infoseek.*' : 'qt=', '.*yahoo_mindset.*' : 'p=', '.*comettoolbar.*' : 'qry=', '.*alltheweb.*' : 'q(|uery)=', '.*miner.*' : 'q=', '.*aol.*' : 'query=', '.*rambler.*' : 'words=', '.*scroogle.*' : 'Gw=', '.*chellose.*' : 'q1=', '.*ineffabile.*' : '', '.*miragoit.*' : '(txtsearch|qry)=', '.*yandex.*' : 'text=', '.*segnalo.*' : '', '.*dodajpl.*' : 'keyword=', '.*avantfind.*' : 'keywords=', '.*nusearch.*' : 'nusearch_terms=', '.*bbc.*' : 'q=', '.*supereva.*' : 'q=', '.*atomz.*' : 'sp-q=', '.*searchy.*' : 'search_term=', '.*dogpile.*' : 'q(|kw)=', '.*chellohu.*' : 'q1=', '.*vnet.*' : 'kw=', '.*1klik.*' : 'query=', '.*t-online.*' : 'q=', '.*hogapl.*' : 'qt=', '.*stumbleupon.*' : '', '.*soso.*' : 'q=', '.*zhongsou.*' : '(word|w)=', '.*a9.*' : 'a9\.com\/', '.*centraldatabase.*' : 'query=', '.*mamma.*' : 'query=', '.*icerocket.*' : 'q=', '.*ask.*' : '(ask|q)=', '.*chellobe.*' : 'q1=', '.*altavista.*' : 'q=', '.*vindex.*' : 'in=', '.*miragodk.*' : '(txtsearch|qry)=', '.*chelloat.*' : 'q1=', '.*digg.*' : 's=', '.*metacrawler.*' : 'general=', '.*nbci.*' : 'keyword=', '.*chellono.*' : 'q1=', '.*icq.*' : 'q=', '.*arianna.*' : 'query=', '.*miragocouk.*' : '(txtsearch|qry)=', '.*3721.*' : '(p|name)=', '.*pogodak.*' : 'q=', '.*ukdirectory.*' : 'k=', '.*overture.*' : 'keywords=', '.*heureka.*' : 'heureka=', '.*teecnoit.*' : 'q=', '.*miragoes.*' : '(txtsearch|qry)=', '.*haku.*' : 'w=', '.*go.*' : 'qt=', '.*fireball.*' : 'q=', '.*wisenut.*' : 'query=', '.*sify.*' : 'keyword=', '.*ixquick.*' : 'query=', '.*anzwers.*' : 'search=', '.*quick.*' : 'query=', '.*jubii.*' : 'soegeord=', '.*questionanswering.*' : '', '.*asknl.*' : '(ask|q)=', '.*askde.*' : '(ask|q)=', '.*att.*' : 'qry=', '.*terra.*' : 'query=', '.*bing.*' : 'q=', '.*wowpl.*' : 'q=', '.*freeserve.*' : 'q=', '.*atlas.*' : '(searchtext|q)=', '.*askuk.*' : '(ask|q)=', '.*godado.*' : 'Keywords=', '.*northernlight.*' : 'qr=', '.*answerbus.*' : '', '.*search.com.*' : 'q=', '.*google_image.*' : '(p|q|as_p|as_q)=', '.*jumpy\.it.*' : 'searchWord=', '.*gazetapl.*' : 'slowo=', '.*yahoo.*' : 'p=', '.*hotbot.*' : 'mt=', '.*metabot.*' : 'st=', '.*copernic.*' : 'web\/', '.*kartoo.*' : '', '.*metaspinner.*' : 'qry=', '.*toile.*' : 'q=', '.*aolde.*' : 'q=', '.*blingo.*' : 'q=', '.*askit.*' : '(ask|q)=', '.*netscape.*' : 'search=', '.*splut.*' : 'pattern=', '.*looksmart.*' : 'key=', '.*sphere.*' : 'q=', '.*sol.*' : 'q=', '.*miragono.*' : '(txtsearch|qry)=', '.*kataweb.*' : 'q=', '.*ofir.*' : 'querytext=', '.*aliceitmaster.*' : 'qs=', '.*miragofr.*' : '(txtsearch|qry)=', '.*spray.*' : 'string=', '.*seznam.*' : '(w|q)=', '.*interiapl.*' : 'q=', '.*euroseek.*' : 'query=', '.*schoenerbrausen.*' : 'q=', '.*centrum.*' : 'q=', '.*netsprintpl.*' : 'q=', '.*go2net.*' : 'general=', '.*katalogonetpl.*' : 'qt=', '.*ukindex.*' : 'stext=', '.*shawca.*' : 'q=', '.*szukaczpl.*' : 'q=', '.*accoona.*' : 'qt=', '.*live.*' : 'q=', '.*google4counter.*' : '(p|q|as_p|as_q)=', '.*iask.*' : '(w|k)=', '.*earthlink.*' : 'q=', '.*tiscali.*' : 'key=', '.*askes.*' : '(ask|q)=', '.*gotuneed.*' : '', '.*clubinternet.*' : 'q=', '.*redbox.*' : 'srch=', '.*delicious.*' : 'all=', '.*chellofr.*' : 'q1=', '.*lycos.*' : 'query=', '.*sympatico.*' : 'query=', '.*vivisimo.*' : 'query=', '.*bluewin.*' : 'qry=', '.*mysearch.*' : 'searchfor=', '.*google_cache.*' : '(p|q|as_p|as_q)=cache:[0-9A-Za-z]{12}:', '.*ukplus.*' : 'search=', '.*gerypl.*' : 'q=', '.*keresolap_hu.*' : 'q=', '.*abacho.*' : 'q=', '.*engine.*' : 'p1=', '.*opasia.*' : 'q=', '.*wp.*' : 'szukaj=', '.*steadysearch.*' : 'w=', '.*chellopl.*' : 'q1=', '.*voila.*' : '(kw|rdata)=', '.*aport.*' : 'r=', '.*internetto.*' : 'searchstr=', '.*passagen.*' : 'q=', '.*wwweasel.*' : 'q=', '.*najdi.*' : 'dotaz=', '.*alexa.*' : 'q=', '.*baidu.*' : '(wd|word)=', '.*spotjockey.*' : 'Search_Keyword=', '.*virgilio.*' : 'qs=', '.*orbis.*' : 'search_field=', '.*tango_hu.*' : 'q=', '.*askjp.*' : '(ask|q)=', '.*bungeebonesdotcom.*' : 'query=', '.*francite.*' : 'name=', '.*searchch.*' : 'q=', '.*google_froogle.*' : '(p|q|as_p|as_q)=', '.*excite.*' : 'search=', '.*infospace.*' : 'qkw=', '.*polskapl.*' : 'qt=', '.*swik.*' : 'swik\.net/', '.*edderkoppen.*' : 'query=', '.*mywebsearch.*' : 'searchfor=', '.*danielsen.*' : 'q=', '.*wahoo.*' : 'q=', '.*sogou.*' : 'query=', '.*miragonl.*' : '(txtsearch|qry)=', '.*findarticles.*' : 'key='} + diff --git a/conf.py b/conf.py index 574c47d..6d9af11 100644 --- a/conf.py +++ b/conf.py @@ -8,12 +8,17 @@ time_format = '%d/%b/%Y:%H:%M:%S +0100' analyzed_filename = 'access.log' +domain_name = 'soutade.fr' + +display_visitor_ip = True + DB_ROOT = './output/' DISPLAY_ROOT = './output/' pre_analysis_hooks = ['page_to_hit', 'robots'] -post_analysis_hooks = ['top_visitors', 'reverse_dns'] -display_hooks = ['top_visitors', 'all_visits'] +post_analysis_hooks = ['top_visitors'] +# post_analysis_hooks = ['top_visitors', 'reverse_dns'] +display_hooks = ['top_visitors', 'all_visits', 'referers'] reverse_dns_timeout = 0.2 page_to_hit_conf = [r'^.+/logo/$'] diff --git a/default_conf.py b/default_conf.py index 074a9e6..1b4c62b 100644 --- a/default_conf.py +++ b/default_conf.py @@ -21,4 +21,4 @@ post_analysis_hooks = [] display_hooks = [] pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php'] -viewed_http_codes = [200] +viewed_http_codes = [200, 304] diff --git a/iwla.py b/iwla.py index b321620..1a3cf0f 100755 --- a/iwla.py +++ b/iwla.py @@ -34,7 +34,7 @@ class IWLA(object): self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted) self.http_request_extracted = re.compile(r'(?P\S+) (?P\S+) (?P\S+)') self.log_re = re.compile(self.log_format_extracted) - self.uri_re = re.compile(r'(?P[^\?]*)[\?(?P.*)]?') + self.uri_re = re.compile(r'(?P[^\?]+)(\?(?P.+))?') self.plugins = {conf.PRE_HOOK_DIRECTORY : conf.pre_analysis_hooks, conf.POST_HOOK_DIRECTORY : conf.post_analysis_hooks, conf.DISPLAY_HOOK_DIRECTORY : conf.display_hooks} @@ -143,9 +143,9 @@ class IWLA(object): hit['is_page'] = self.isPage(uri) - # Don't count 3xx status status = int(hit['status']) - if status >= 300 and status < 400: return + if status not in conf.viewed_http_codes: + return if super_hit['robot'] or\ not status in conf.viewed_http_codes: @@ -163,6 +163,7 @@ class IWLA(object): def _createVisitor(self, hit): super_hit = self.current_analysis['visits'][hit['remote_addr']] = {} super_hit['remote_addr'] = hit['remote_addr'] + super_hit['remote_ip'] = hit['remote_addr'] super_hit['viewed_pages'] = 0 super_hit['viewed_hits'] = 0 super_hit['not_viewed_pages'] = 0 @@ -191,9 +192,10 @@ class IWLA(object): print "Bad request extraction " + hit['request'] return False - referer_groups = self.uri_re.match(hit['http_referer']) - if referer_groups: - referer = hit['extract_referer'] = referer_groups.groupdict() + if hit['http_referer']: + referer_groups = self.uri_re.match(hit['http_referer']) + if referer_groups: + hit['extract_referer'] = referer_groups.groupdict() return True def _decodeTime(self, hit): @@ -229,13 +231,13 @@ class IWLA(object): nb_days = len(keys) row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']] if nb_days: - average_row = map(lambda(v): str(int(v/nb_days)), row) + average_row = map(lambda(v): int(v/nb_days), row) else: - average_row = map(lambda(v): '0', row) + average_row = map(lambda(v): 0, row) average_row[0] = 'Average' - average_row[4] = bytesToStr(row[4]) - average_row[5] = bytesToStr(row[5]) + average_row[4] = bytesToStr(average_row[4]) + average_row[5] = bytesToStr(average_row[5]) days.appendRow(average_row) row[0] = 'Total' diff --git a/iwla_convert.pl b/iwla_convert.pl index a47140a..5a74cf8 100755 --- a/iwla_convert.pl +++ b/iwla_convert.pl @@ -1,34 +1,79 @@ #!/usr/bin/perl -my $awstats_lib_root = '/usr/share/awstats/lib/'; -my @awstats_libs = ('browsers.pm', 'browsers_phone.pm', 'mime.pm', 'referer_spam.pm', 'search_engines.pm', 'operating_systems.pm', 'robots.pm', 'worms.pm'); +my $awstats_lib_root = './'; +my @awstats_libs = ('search_engines.pm', 'robots.pm'); + +# my $awstats_lib_root = '/usr/share/awstats/lib/'; +# my @awstats_libs = ('browsers.pm', 'browsers_phone.pm', 'mime.pm', 'referer_spam.pm', 'search_engines.pm', 'operating_systems.pm', 'robots.pm', 'worms.pm'); foreach $lib (@awstats_libs) {require $awstats_lib_root . $lib;} -open($FIC,">", "robots.py") or die $!; +sub dumpList { + my @list = @{$_[0]}; + my $FIC = $_[1]; + my $first = $_[2]; -print $FIC "awstats_robots = ["; -$first = 0; -foreach $r (@RobotsSearchIDOrder_list1) -{ - $r =~ s/\'/\\\'/g; - if ($first != 0) + foreach $r (@list) { - print $FIC ", "; + $r =~ s/\'/\\\'/g; + if ($first == 0) + { + print $FIC ", "; + } + else + { + $first = 0; + } + print $FIC "'.*$r.*'"; } - else - { - $first = 1; +} + +sub dumpHash { + my %hash = %{$_[0]}; + my $FIC = $_[1]; + my $first = $_[2]; + + while( my ($k,$v) = each(%hash) ) { + $k =~ s/\'/\\\'/g; + $v =~ s/\'/\\\'/g; + if ($first == 0) + { + print $FIC ", "; + } + else + { + $first = 0; + } + print $FIC "'.*$k.*' : '$v'"; } - print $FIC "'.*$r.*'"; -} -foreach $r (@RobotsSearchIDOrder_list2) -{ - $r =~ s/\'/\\\'/g; - print $FIC ", '.*$r.*'"; } + +# Robots +open($FIC,">", "awstats_data.py") or die $!; + +print $FIC "robots = ["; +dumpList(\@RobotsSearchIDOrder_list1, $FIC, 1); +dumpList(\@RobotsSearchIDOrder_list2, $FIC, 0); print $FIC "]\n\n"; +print $FIC "search_engines = ["; +dumpList(\@SearchEnginesSearchIDOrder_list1, $FIC, 1); +print $FIC "]\n\n"; + +print $FIC "search_engines_2 = ["; +dumpList(\@SearchEnginesSearchIDOrder_list2, $FIC, 1); +print $FIC "]\n\n"; + +print $FIC "not_search_engines_keys = {"; +dumpHash(\%NotSearchEnginesKeys, $FIC, 1); +print $FIC "}\n\n"; + +print $FIC "search_engines_hashid = {"; +dumpHash(\%SearchEnginesHashID, $FIC, 1); +print $FIC "}\n\n"; + +print $FIC "search_engines_knwown_url = {"; +dumpHash(\%SearchEnginesKnownUrl, $FIC, 1); +print $FIC "}\n\n"; + close($FIC); - - diff --git a/plugins/display/all_visits.py b/plugins/display/all_visits.py index 28a0534..131c754 100644 --- a/plugins/display/all_visits.py +++ b/plugins/display/all_visits.py @@ -14,7 +14,7 @@ class IWLADisplayAllVisits(IPlugin): last_access = sorted(hits.values(), key=lambda t: t['last_access'], reverse=True) cur_time = self.iwla.getCurTime() - title = time.strftime('All visits %B %Y', cur_time) + title = time.strftime('All visits - %B %Y', cur_time) filename = 'all_visits_%d.html' % (cur_time.tm_mon) path = '%d/%s' % (cur_time.tm_year, filename) @@ -22,8 +22,13 @@ class IWLADisplayAllVisits(IPlugin): page = DisplayHTMLPage(title, path) table = DisplayHTMLBlockTable('Last seen', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen']) for super_hit in last_access: + address = super_hit['remote_addr'] + if self.iwla.getConfValue('display_visitor_ip', False) and\ + super_hit.get('dns_name_replaced', False): + address = '%s [%s]' % (address, super_hit['remote_ip']) + row = [ - super_hit['remote_addr'], + address, super_hit['viewed_pages'], super_hit['viewed_hits'], bytesToStr(super_hit['bandwidth']), diff --git a/plugins/display/referers.py b/plugins/display/referers.py new file mode 100644 index 0000000..56924d9 --- /dev/null +++ b/plugins/display/referers.py @@ -0,0 +1,199 @@ +import time +import re +import HTMLParser + +from iwla import IWLA +from iplugin import IPlugin +from display import * + +import awstats_data + +class IWLADisplayReferers(IPlugin): + def __init__(self, iwla): + super(IWLADisplayReferers, self).__init__(iwla) + self.API_VERSION = 1 + + def load(self): + domain_name = self.iwla.getConfValue('domain_name', '') + + if not domain_name: + print 'domain_name required in conf' + return False + + self.own_domain_re = re.compile('.*%s.*' % (domain_name)) + self.search_engines = {} + + for engine in awstats_data.search_engines: + self.search_engines[engine] = { + 're' : re.compile(engine, re.IGNORECASE) + } + + for (engine, not_engine) in awstats_data.not_search_engines_keys.items(): + if not engine in self.search_engines: continue + self.search_engines[engine]['not_search_engine'] = \ + re.compile(not_engine, re.IGNORECASE) + + for (engine, name) in awstats_data.search_engines_hashid.items(): + if not engine in self.search_engines: continue + self.search_engines[engine]['name'] = name + + for (engine, knwown_url) in awstats_data.search_engines_knwown_url.items(): + engine = engin[2:-2] + if not engine in self.search_engines: continue + print knwown_url + self.search_engines[engine]['known_url'] = re.compile(known_url + '(?P.+)') + + + self.html_parser = HTMLParser.HTMLParser() + + return True + + def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases): + if not parameters or not key_phrase_re: return + + + for p in parameters.split('&'): + groups = key_phrase_re.match(p) + if groups: + print groups.groupddict() + key_phrase = self.html_parser.unescape(groups.groupddict()['key_phrase']).lower() + if not key_phrase in key_phrases.keys(): + key_phrases[key_phrase] = 1 + else: + key_phrases[key_phrase] += 1 + + def hook(self, iwla): + stats = iwla.getCurrentVisists() + referers = {} + robots_referers = {} + search_engine_referers = {} + key_phrases = {} + + for (k, super_hit) in stats.items(): + for r in super_hit['requests']: + if not r['http_referer']: continue + + uri = r['extract_referer']['extract_uri'] + is_search_engine = False + + if self.own_domain_re.match(uri): continue + + for e in self.search_engines.values(): + if e['re'].match(uri): + not_engine = e.get('not_search_engine', None) + # Try not engine + if not_engine and not_engine.match(uri): break + is_search_engine = True + uri = e['name'] + + parameters = r['extract_referer'].get('extract_parameters', None) + key_phrase_re = e.get('known_url', None) + + print parameters + print key_phrase_re + + self._extractKeyPhrase(key_phrase_re, parameters, key_phrases) + + break + + if is_search_engine: + dictionary = search_engine_referers + elif super_hit['robot']: + dictionary = robots_referers + # print '%s => %s' % (uri, super_hit['remote_ip']) + else: + dictionary = referers + if r['is_page']: + key = 'pages' + else: + key = 'hits' + if not uri in dictionary: dictionary[uri] = {'pages':0, 'hits':0} + dictionary[uri][key] += 1 + + top_referers = [(k, referers[k]['pages']) for k in referers.keys()] + top_referers = sorted(top_referers, key=lambda t: t[1], reverse=True) + + top_robots_referers = [(k, robots_referers[k]['pages']) for k in robots_referers.keys()] + top_robots_referers = sorted(top_robots_referers, key=lambda t: t[1], reverse=True) + + top_search_engine_referers = [(k, search_engine_referers[k]['pages']) for k in search_engine_referers.keys()] + top_search_engine_referers = sorted(top_search_engine_referers, key=lambda t: t[1], reverse=True) + + top_key_phrases = key_phrases.items() + top_key_phrases = sorted(top_key_phrases, key=lambda t: t[1], reverse=True) + + # Top referers in index + index = self.iwla.getDisplayIndex() + + table = DisplayHTMLBlockTable('Connexion from', ['Origin', 'Pages', 'Hits']) + table.appendRow(['Search Engine', '', '']) + for r,_ in top_search_engine_referers[:10]: + row = [r, search_engine_referers[r]['pages'], search_engine_referers[r]['hits']] + table.appendRow(row) + + table.appendRow(['External URL', '', '']) + for r,_ in top_referers[:10]: + row = [r, referers[r]['pages'], referers[r]['hits']] + table.appendRow(row) + + table.appendRow(['External URL (robot)', '', '']) + for r,_ in top_robots_referers[:10]: + row = [r, robots_referers[r]['pages'], robots_referers[r]['hits']] + table.appendRow(row) + + index.appendBlock(table) + + # All referers in a file + cur_time = self.iwla.getCurTime() + title = time.strftime('Connexion from - %B %Y', cur_time) + + filename = 'referers_%d.html' % (cur_time.tm_mon) + path = '%d/%s' % (cur_time.tm_year, filename) + + page = DisplayHTMLPage(title, path) + table = DisplayHTMLBlockTable('Connexion from', ['Origin', 'Pages', 'Hits']) + + table.appendRow(['Search Engine', '', '']) + for r,_ in top_search_engine_referers: + row = [r, search_engine_referers[r]['pages'], search_engine_referers[r]['hits']] + table.appendRow(row) + + table.appendRow(['External URL', '', '']) + for r,_ in top_referers: + row = [r, referers[r]['pages'], referers[r]['hits']] + table.appendRow(row) + + table.appendRow(['External URL (robot)', '', '']) + for r,_ in top_robots_referers: + row = [r, robots_referers[r]['pages'], robots_referers[r]['hits']] + table.appendRow(row) + + page.appendBlock(table) + + display = self.iwla.getDisplay() + display.addPage(page) + + block = DisplayHTMLRawBlock() + block.setRawHTML('All referers' % (filename)) + index.appendBlock(block) + + # Top key phrases in index + table = DisplayHTMLBlockTable('Top key phrases', ['Key phrase', 'Search']) + for phrase in top_key_phrases[:10]: + table.appendRow([phrase[0], phrase[1]]) + index.appendBlock(table) + + # All key phrases in a file + cur_time = self.iwla.getCurTime() + title = time.strftime('Key Phrases - %B %Y', cur_time) + + filename = 'key_phrases_%d.html' % (cur_time.tm_mon) + path = '%d/%s' % (cur_time.tm_year, filename) + + page = DisplayHTMLPage(title, path) + table = DisplayHTMLBlockTable('Top key phrases', ['Key phrase', 'Search']) + for phrase in top_key_phrases: + table.appendRow([phrase[0], phrase[1]]) + page.appendBlock(table) + + display.addPage(page) diff --git a/plugins/display/top_visitors.py b/plugins/display/top_visitors.py index 93f455a..1959545 100644 --- a/plugins/display/top_visitors.py +++ b/plugins/display/top_visitors.py @@ -16,8 +16,13 @@ class IWLADisplayTopVisitors(IPlugin): index = iwla.getDisplayIndex() table = DisplayHTMLBlockTable('Top visitors', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen']) for super_hit in stats['top_visitors']: + address = super_hit['remote_addr'] + if self.iwla.getConfValue('display_visitor_ip', False) and\ + super_hit.get('dns_name_replaced', False): + address = '%s [%s]' % (address, super_hit['remote_ip']) + row = [ - super_hit['remote_addr'], + address, super_hit['viewed_pages'], super_hit['viewed_hits'], bytesToStr(super_hit['bandwidth']), diff --git a/plugins/post_analysis/reverse_dns.py b/plugins/post_analysis/reverse_dns.py index 3a5210b..8898115 100644 --- a/plugins/post_analysis/reverse_dns.py +++ b/plugins/post_analysis/reverse_dns.py @@ -20,6 +20,7 @@ class IWLAPostAnalysisReverseDNS(IPlugin): try: name, _, _ = socket.gethostbyaddr(k) hit['remote_addr'] = name + hit['dns_name_replaced'] = True except: pass finally: diff --git a/plugins/post_analysis/top_visitors.py b/plugins/post_analysis/top_visitors.py index c7de05b..714ca66 100644 --- a/plugins/post_analysis/top_visitors.py +++ b/plugins/post_analysis/top_visitors.py @@ -9,7 +9,7 @@ class IWLAPostAnalysisTopVisitors(IPlugin): def hook(self, iwla): hits = iwla.getValidVisitors() stats = iwla.getMonthStats() - top_bandwidth = [(k,hits[k]['bandwidth']) for (k,v) in hits.items()] + top_bandwidth = [(k,hits[k]['bandwidth']) for k in hits.keys()] top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True) stats['top_visitors'] = [hits[h[0]] for h in top_bandwidth[:10]] diff --git a/plugins/pre_analysis/robots.py b/plugins/pre_analysis/robots.py index 596552e..6211ade 100644 --- a/plugins/pre_analysis/robots.py +++ b/plugins/pre_analysis/robots.py @@ -3,7 +3,7 @@ import re from iwla import IWLA from iplugin import IPlugin -from awstats_robots_data import awstats_robots +import awstats_data class IWLAPreAnalysisRobots(IPlugin): def __init__(self, iwla): @@ -11,9 +11,7 @@ class IWLAPreAnalysisRobots(IPlugin): self.API_VERSION = 1 def load(self): - global awstats_robots - - self.awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots) + self.awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_data.robots) return True @@ -32,13 +30,17 @@ class IWLAPreAnalysisRobots(IPlugin): if first_page['time_decoded'].tm_mday == super_hit['last_access'].tm_mday: for r in self.awstats_robots: if r.match(first_page['http_user_agent']): - super_hit['robot'] = 1 - continue + isRobot = True + break + + if isRobot: + super_hit['robot'] = 1 + continue # 1) no pages view --> robot - if not super_hit['viewed_pages']: - super_hit['robot'] = 1 - continue + # if not super_hit['viewed_pages']: + # super_hit['robot'] = 1 + # continue # 2) pages without hit --> robot if not super_hit['viewed_hits']: @@ -59,6 +61,7 @@ class IWLAPreAnalysisRobots(IPlugin): super_hit['robot'] = 1 continue - if super_hit['viewed_hits'] and not referers: + if not super_hit['viewed_pages'] and \ + (super_hit['viewed_hits'] and not referers): super_hit['robot'] = 1 continue diff --git a/search_engines.py b/search_engines.py new file mode 100644 index 0000000..3c4f8ba --- /dev/null +++ b/search_engines.py @@ -0,0 +1,8 @@ +awstats_search_engines = ['.*google\.[\w.]+/products.*', '.*base\.google\..*', '.*froogle\.google\..*', '.*groups\.google\..*', '.*images\.google\..*', '.*google\..*', '.*googlee\..*', '.*googlecom\.com.*', '.*goggle\.co\.hu.*', '.*216\.239\.(35|37|39|51)\.100.*', '.*216\.239\.(35|37|39|51)\.101.*', '.*216\.239\.5[0-9]\.104.*', '.*64\.233\.1[0-9]{2}\.104.*', '.*66\.102\.[1-9]\.104.*', '.*66\.249\.93\.104.*', '.*72\.14\.2[0-9]{2}\.104.*', '.*msn\..*', '.*live\.com.*', '.*bing\..*', '.*voila\..*', '.*mindset\.research\.yahoo.*', '.*yahoo\..*', '.*(66\.218\.71\.225|216\.109\.117\.135|216\.109\.125\.130|66\.218\.69\.11).*', '.*search\.aol\.co.*', '.*tiscali\..*', '.*lycos\..*', '.*alexa\.com.*', '.*alltheweb\.com.*', '.*altavista\..*', '.*a9\.com.*', '.*dmoz\.org.*', '.*netscape\..*', '.*search\.terra\..*', '.*www\.search\.com.*', '.*search\.sli\.sympatico\.ca.*', '.*excite\..*'] + +awstats_search_engines_2 = ['.*4\-counter\.com.*', '.*att\.net.*', '.*bungeebonesdotcom.*', '.*northernlight\..*', '.*hotbot\..*', '.*kvasir\..*', '.*webcrawler\..*', '.*metacrawler\..*', '.*go2net\.com.*', '.*(^|\.)go\.com.*', '.*euroseek\..*', '.*looksmart\..*', '.*spray\..*', '.*nbci\.com\/search.*', '.*de\.ask.\com.*', '.*es\.ask.\com.*', '.*fr\.ask.\com.*', '.*it\.ask.\com.*', '.*nl\.ask.\com.*', '.*uk\.ask.\com.*', '.*(^|\.)ask\.com.*', '.*atomz\..*', '.*overture\.com.*', '.*teoma\..*', '.*findarticles\.com.*', '.*infospace\.com.*', '.*mamma\..*', '.*dejanews\..*', '.*dogpile\.com.*', '.*wisenut\.com.*', '.*ixquick\.com.*', '.*search\.earthlink\.net.*', '.*i-une\.com.*', '.*blingo\.com.*', '.*centraldatabase\.org.*', '.*clusty\.com.*', '.*mysearch\..*', '.*vivisimo\.com.*', '.*kartoo\.com.*', '.*icerocket\.com.*', '.*sphere\.com.*', '.*ledix\.net.*', '.*start\.shaw\.ca.*', '.*searchalot\.com.*', '.*copernic\.com.*', '.*avantfind\.com.*', '.*steadysearch\.com.*', '.*steady-search\.com.*', '.*chello\.at.*', '.*chello\.be.*', '.*chello\.cz.*', '.*chello\.fr.*', '.*chello\.hu.*', '.*chello\.nl.*', '.*chello\.no.*', '.*chello\.pl.*', '.*chello\.se.*', '.*chello\.sk.*', '.*chello.*', '.*mirago\.be.*', '.*mirago\.ch.*', '.*mirago\.de.*', '.*mirago\.dk.*', '.*es\.mirago\.com.*', '.*mirago\.fr.*', '.*mirago\.it.*', '.*mirago\.nl.*', '.*no\.mirago\.com.*', '.*mirago\.se.*', '.*mirago\.co\.uk.*', '.*mirago.*', '.*answerbus\.com.*', '.*icq\.com\/search.*', '.*nusearch\.com.*', '.*goodsearch\.com.*', '.*scroogle\.org.*', '.*questionanswering\.com.*', '.*mywebsearch\.com.*', '.*as\.starware\.com.*', '.*del\.icio\.us.*', '.*digg\.com.*', '.*stumbleupon\.com.*', '.*swik\.net.*', '.*segnalo\.alice\.it.*', '.*ineffabile\.it.*', '.*anzwers\.com\.au.*', '.*engine\.exe.*', '.*miner\.bol\.com\.br.*', '.*\.baidu\.com.*', '.*\.vnet\.cn.*', '.*\.soso\.com.*', '.*\.sogou\.com.*', '.*\.3721\.com.*', '.*iask\.com.*', '.*\.accoona\.com.*', '.*\.163\.com.*', '.*\.zhongsou\.com.*', '.*atlas\.cz.*', '.*seznam\.cz.*', '.*quick\.cz.*', '.*centrum\.cz.*', '.*jyxo\.(cz|com).*', '.*najdi\.to.*', '.*redbox\.cz.*', '.*opasia\.dk.*', '.*danielsen\.com.*', '.*sol\.dk.*', '.*jubii\.dk.*', '.*find\.dk.*', '.*edderkoppen\.dk.*', '.*netstjernen\.dk.*', '.*orbis\.dk.*', '.*tyfon\.dk.*', '.*1klik\.dk.*', '.*ofir\.dk.*', '.*ilse\..*', '.*vindex\..*', '.*(^|\.)ask\.co\.uk.*', '.*bbc\.co\.uk/cgi-bin/search.*', '.*ifind\.freeserve.*', '.*looksmart\.co\.uk.*', '.*splut\..*', '.*spotjockey\..*', '.*ukdirectory\..*', '.*ukindex\.co\.uk.*', '.*ukplus\..*', '.*searchy\.co\.uk.*', '.*haku\.www\.fi.*', '.*recherche\.aol\.fr.*', '.*ctrouve\..*', '.*francite\..*', '.*\.lbb\.org.*', '.*rechercher\.libertysurf\.fr.*', '.*search[\w\-]+\.free\.fr.*', '.*recherche\.club-internet\.fr.*', '.*toile\.com.*', '.*biglotron\.com.*', '.*mozbot\.fr.*', '.*sucheaol\.aol\.de.*', '.*fireball\.de.*', '.*infoseek\.de.*', '.*suche\d?\.web\.de.*', '.*[a-z]serv\.rrzn\.uni-hannover\.de.*', '.*suchen\.abacho\.de.*', '.*(brisbane|suche)\.t-online\.de.*', '.*allesklar\.de.*', '.*meinestadt\.de.*', '.*212\.227\.33\.241.*', '.*(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42).*', '.*wwweasel\.de.*', '.*netluchs\.de.*', '.*schoenerbrausen\.de.*', '.*heureka\.hu.*', '.*vizsla\.origo\.hu.*', '.*lapkereso\.hu.*', '.*goliat\.hu.*', '.*index\.hu.*', '.*wahoo\.hu.*', '.*webmania\.hu.*', '.*search\.internetto\.hu.*', '.*tango\.hu.*', '.*keresolap\.hu.*', '.*polymeta\.hu.*', '.*sify\.com.*', '.*virgilio\.it.*', '.*arianna\.libero\.it.*', '.*supereva\.com.*', '.*kataweb\.it.*', '.*search\.alice\.it\.master.*', '.*search\.alice\.it.*', '.*gotuneed\.com.*', '.*godado.*', '.*jumpy\.it.*', '.*shinyseek\.it.*', '.*teecno\.it.*', '.*ask\.jp.*', '.*sagool\.jp.*', '.*sok\.start\.no.*', '.*eniro\.no.*', '.*szukaj\.wp\.pl.*', '.*szukaj\.onet\.pl.*', '.*dodaj\.pl.*', '.*gazeta\.pl.*', '.*gery\.pl.*', '.*hoga\.pl.*', '.*netsprint\.pl.*', '.*interia\.pl.*', '.*katalog\.onet\.pl.*', '.*o2\.pl.*', '.*polska\.pl.*', '.*szukacz\.pl.*', '.*wow\.pl.*', '.*ya(ndex)?\.ru.*', '.*aport\.ru.*', '.*rambler\.ru.*', '.*turtle\.ru.*', '.*metabot\.ru.*', '.*evreka\.passagen\.se.*', '.*eniro\.se.*', '.*zoznam\.sk.*', '.*sapo\.pt.*', '.*search\.ch.*', '.*search\.bluewin\.ch.*', '.*pogodak\..*'] + +awstats_not_search_engines_keys = {'.*yahoo\..*' : '(?:picks|mail)\.yahoo\.|yahoo\.[^/]+/picks', '.*altavista\..*' : 'babelfish\.altavista\.', '.*tiscali\..*' : 'mail\.tiscali\.', '.*yandex\..*' : 'direct\.yandex\.', '.*google\..*' : 'translate\.google\.', '.*msn\..*' : 'hotmail\.msn\.'} + +awstats_search_engines_hashid = {'.*search\.sli\.sympatico\.ca.*' : 'sympatico', '.*mywebsearch\.com.*' : 'mywebsearch', '.*netsprint\.pl\/hoga\-search.*' : 'hogapl', '.*findarticles\.com.*' : 'findarticles', '.*wow\.pl.*' : 'wowpl', '.*allesklar\.de.*' : 'allesklar', '.*atomz\..*' : 'atomz', '.*bing\..*' : 'bing', '.*find\.dk.*' : 'finddk', '.*google\..*' : 'google', '.*(66\.218\.71\.225|216\.109\.117\.135|216\.109\.125\.130|66\.218\.69\.11).*' : 'yahoo', '.*pogodak\..*' : 'pogodak', '.*ask\.jp.*' : 'askjp', '.*\.baidu\.com.*' : 'baidu', '.*tango\.hu.*' : 'tango_hu', '.*gotuneed\.com.*' : 'gotuneed', '.*quick\.cz.*' : 'quick', '.*mirago.*' : 'mirago', '.*szukaj\.wp\.pl.*' : 'wp', '.*mirago\.de.*' : 'miragode', '.*mirago\.dk.*' : 'miragodk', '.*katalog\.onet\.pl.*' : 'katalogonetpl', '.*googlee\..*' : 'google', '.*orbis\.dk.*' : 'orbis', '.*turtle\.ru.*' : 'turtle', '.*zoznam\.sk.*' : 'zoznam', '.*start\.shaw\.ca.*' : 'shawca', '.*chello\.at.*' : 'chelloat', '.*centraldatabase\.org.*' : 'centraldatabase', '.*centrum\.cz.*' : 'centrum', '.*kataweb\.it.*' : 'kataweb', '.*\.lbb\.org.*' : 'lbb', '.*blingo\.com.*' : 'blingo', '.*vivisimo\.com.*' : 'vivisimo', '.*stumbleupon\.com.*' : 'stumbleupon', '.*es\.ask.\com.*' : 'askes', '.*interia\.pl.*' : 'interiapl', '.*[a-z]serv\.rrzn\.uni-hannover\.de.*' : 'meta', '.*search\.alice\.it.*' : 'aliceit', '.*shinyseek\.it.*' : 'shinyseek\.it', '.*i-une\.com.*' : 'iune', '.*dejanews\..*' : 'dejanews', '.*opasia\.dk.*' : 'opasia', '.*chello\.cz.*' : 'chellocz', '.*ya(ndex)?\.ru.*' : 'yandex', '.*kartoo\.com.*' : 'kartoo', '.*arianna\.libero\.it.*' : 'arianna', '.*ofir\.dk.*' : 'ofir', '.*search\.earthlink\.net.*' : 'earthlink', '.*biglotron\.com.*' : 'biglotron', '.*lapkereso\.hu.*' : 'lapkereso', '.*216\.239\.(35|37|39|51)\.101.*' : 'google_cache', '.*miner\.bol\.com\.br.*' : 'miner', '.*dodaj\.pl.*' : 'dodajpl', '.*mirago\.be.*' : 'miragobe', '.*googlecom\.com.*' : 'google', '.*steadysearch\.com.*' : 'steadysearch', '.*redbox\.cz.*' : 'redbox', '.*haku\.www\.fi.*' : 'haku', '.*sapo\.pt.*' : 'sapo', '.*sphere\.com.*' : 'sphere', '.*danielsen\.com.*' : 'danielsen', '.*alexa\.com.*' : 'alexa', '.*mamma\..*' : 'mamma', '.*swik\.net.*' : 'swik', '.*polska\.pl.*' : 'polskapl', '.*groups\.google\..*' : 'google_groups', '.*metabot\.ru.*' : 'metabot', '.*rechercher\.libertysurf\.fr.*' : 'libertysurf', '.*szukaj\.onet\.pl.*' : 'onetpl', '.*aport\.ru.*' : 'aport', '.*de\.ask.\com.*' : 'askde', '.*splut\..*' : 'splut', '.*live\.com.*' : 'live', '.*216\.239\.5[0-9]\.104.*' : 'google_cache', '.*mysearch\..*' : 'mysearch', '.*ukplus\..*' : 'ukplus', '.*najdi\.to.*' : 'najdi', '.*overture\.com.*' : 'overture', '.*iask\.com.*' : 'iask', '.*nl\.ask.\com.*' : 'asknl', '.*nbci\.com\/search.*' : 'nbci', '.*search\.aol\.co.*' : 'aol', '.*eniro\.se.*' : 'enirose', '.*64\.233\.1[0-9]{2}\.104.*' : 'google_cache', '.*mirago\.ch.*' : 'miragoch', '.*altavista\..*' : 'altavista', '.*chello\.hu.*' : 'chellohu', '.*mozbot\.fr.*' : 'mozbot', '.*northernlight\..*' : 'northernlight', '.*mirago\.co\.uk.*' : 'miragocouk', '.*search[\w\-]+\.free\.fr.*' : 'free', '.*mindset\.research\.yahoo.*' : 'yahoo_mindset', '.*copernic\.com.*' : 'copernic', '.*heureka\.hu.*' : 'heureka', '.*steady-search\.com.*' : 'steadysearch', '.*teecno\.it.*' : 'teecnoit', '.*voila\..*' : 'voila', '.*netstjernen\.dk.*' : 'netstjernen', '.*keresolap\.hu.*' : 'keresolap_hu', '.*yahoo\..*' : 'yahoo', '.*icerocket\.com.*' : 'icerocket', '.*alltheweb\.com.*' : 'alltheweb', '.*www\.search\.com.*' : 'search.com', '.*digg\.com.*' : 'digg', '.*tiscali\..*' : 'tiscali', '.*spotjockey\..*' : 'spotjockey', '.*a9\.com.*' : 'a9', '.*(brisbane|suche)\.t-online\.de.*' : 't-online', '.*ifind\.freeserve.*' : 'freeserve', '.*att\.net.*' : 'att', '.*mirago\.it.*' : 'miragoit', '.*index\.hu.*' : 'indexhu', '.*\.sogou\.com.*' : 'sogou', '.*no\.mirago\.com.*' : 'miragono', '.*ineffabile\.it.*' : 'ineffabile', '.*netluchs\.de.*' : 'netluchs', '.*toile\.com.*' : 'toile', '.*search\..*\.\w+.*' : 'search', '.*del\.icio\.us.*' : 'delicious', '.*vizsla\.origo\.hu.*' : 'origo', '.*netscape\..*' : 'netscape', '.*dogpile\.com.*' : 'dogpile', '.*anzwers\.com\.au.*' : 'anzwers', '.*\.zhongsou\.com.*' : 'zhongsou', '.*ctrouve\..*' : 'ctrouve', '.*gazeta\.pl.*' : 'gazetapl', '.*recherche\.club-internet\.fr.*' : 'clubinternet', '.*sok\.start\.no.*' : 'start', '.*scroogle\.org.*' : 'scroogle', '.*schoenerbrausen\.de.*' : 'schoenerbrausen', '.*looksmart\.co\.uk.*' : 'looksmartuk', '.*wwweasel\.de.*' : 'wwweasel', '.*godado.*' : 'godado', '.*216\.239\.(35|37|39|51)\.100.*' : 'google_cache', '.*jubii\.dk.*' : 'jubii', '.*212\.227\.33\.241.*' : 'metaspinner', '.*mirago\.fr.*' : 'miragofr', '.*sol\.dk.*' : 'sol', '.*bbc\.co\.uk/cgi-bin/search.*' : 'bbc', '.*jumpy\.it.*' : 'jumpy\.it', '.*francite\..*' : 'francite', '.*infoseek\.de.*' : 'infoseek', '.*es\.mirago\.com.*' : 'miragoes', '.*jyxo\.(cz|com).*' : 'jyxo', '.*hotbot\..*' : 'hotbot', '.*engine\.exe.*' : 'engine', '.*(^|\.)ask\.com.*' : 'ask', '.*goliat\.hu.*' : 'goliat', '.*wisenut\.com.*' : 'wisenut', '.*mirago\.nl.*' : 'miragonl', '.*base\.google\..*' : 'google_base', '.*search\.bluewin\.ch.*' : 'bluewin', '.*lycos\..*' : 'lycos', '.*meinestadt\.de.*' : 'meinestadt', '.*4\-counter\.com.*' : 'google4counter', '.*search\.alice\.it\.master.*' : 'aliceitmaster', '.*teoma\..*' : 'teoma', '.*(^|\.)ask\.co\.uk.*' : 'askuk', '.*tyfon\.dk.*' : 'tyfon', '.*froogle\.google\..*' : 'google_froogle', '.*ukdirectory\..*' : 'ukdirectory', '.*ledix\.net.*' : 'ledix', '.*edderkoppen\.dk.*' : 'edderkoppen', '.*recherche\.aol\.fr.*' : 'aolfr', '.*google\.[\w.]+/products.*' : 'google_products', '.*webmania\.hu.*' : 'webmania', '.*searchy\.co\.uk.*' : 'searchy', '.*fr\.ask.\com.*' : 'askfr', '.*spray\..*' : 'spray', '.*72\.14\.2[0-9]{2}\.104.*' : 'google_cache', '.*eniro\.no.*' : 'eniro', '.*goodsearch\.com.*' : 'goodsearch', '.*kvasir\..*' : 'kvasir', '.*\.accoona\.com.*' : 'accoona', '.*\.soso\.com.*' : 'soso', '.*as\.starware\.com.*' : 'comettoolbar', '.*virgilio\.it.*' : 'virgilio', '.*o2\.pl.*' : 'o2pl', '.*chello\.nl.*' : 'chellonl', '.*chello\.be.*' : 'chellobe', '.*icq\.com\/search.*' : 'icq', '.*msn\..*' : 'msn', '.*fireball\.de.*' : 'fireball', '.*sucheaol\.aol\.de.*' : 'aolde', '.*uk\.ask.\com.*' : 'askuk', '.*euroseek\..*' : 'euroseek', '.*gery\.pl.*' : 'gerypl', '.*chello\.fr.*' : 'chellofr', '.*netsprint\.pl.*' : 'netsprintpl', '.*avantfind\.com.*' : 'avantfind', '.*supereva\.com.*' : 'supereva', '.*polymeta\.hu.*' : 'polymeta_hu', '.*infospace\.com.*' : 'infospace', '.*sify\.com.*' : 'sify', '.*go2net\.com.*' : 'go2net', '.*wahoo\.hu.*' : 'wahoo', '.*suche\d?\.web\.de.*' : 'webde', '.*(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42).*' : 'metacrawler_de', '.*\.3721\.com.*' : '3721', '.*ilse\..*' : 'ilse', '.*metacrawler\..*' : 'metacrawler', '.*sagool\.jp.*' : 'sagool', '.*atlas\.cz.*' : 'atlas', '.*vindex\..*' : 'vindex', '.*ixquick\.com.*' : 'ixquick', '.*66\.102\.[1-9]\.104.*' : 'google_cache', '.*rambler\.ru.*' : 'rambler', '.*answerbus\.com.*' : 'answerbus', '.*evreka\.passagen\.se.*' : 'passagen', '.*chello\.se.*' : 'chellose', '.*clusty\.com.*' : 'clusty', '.*search\.ch.*' : 'searchch', '.*chello\.no.*' : 'chellono', '.*searchalot\.com.*' : 'searchalot', '.*questionanswering\.com.*' : 'questionanswering', '.*seznam\.cz.*' : 'seznam', '.*ukindex\.co\.uk.*' : 'ukindex', '.*dmoz\.org.*' : 'dmoz', '.*excite\..*' : 'excite', '.*chello\.pl.*' : 'chellopl', '.*looksmart\..*' : 'looksmart', '.*1klik\.dk.*' : '1klik', '.*\.vnet\.cn.*' : 'vnet', '.*chello\.sk.*' : 'chellosk', '.*(^|\.)go\.com.*' : 'go', '.*nusearch\.com.*' : 'nusearch', '.*it\.ask.\com.*' : 'askit', '.*bungeebonesdotcom.*' : 'bungeebonesdotcom', '.*search\.terra\..*' : 'terra', '.*webcrawler\..*' : 'webcrawler', '.*suchen\.abacho\.de.*' : 'abacho', '.*szukacz\.pl.*' : 'szukaczpl', '.*66\.249\.93\.104.*' : 'google_cache', '.*search\.internetto\.hu.*' : 'internetto', '.*goggle\.co\.hu.*' : 'google', '.*mirago\.se.*' : 'miragose', '.*images\.google\..*' : 'google_image', '.*segnalo\.alice\.it.*' : 'segnalo', '.*\.163\.com.*' : 'netease', '.*chello.*' : 'chellocom'} +