[cig-commits] r16272 - cs/stats/trunk
sue at geodynamics.org
sue at geodynamics.org
Wed Feb 17 06:42:20 PST 2010
Author: sue
Date: 2010-02-17 06:42:19 -0800 (Wed, 17 Feb 2010)
New Revision: 16272
Modified:
cs/stats/trunk/get_all.cxx
Log:
Reliably remove all robots by not counting any IPs that look at robots.txt
Modified: cs/stats/trunk/get_all.cxx
===================================================================
--- cs/stats/trunk/get_all.cxx 2010-02-17 01:46:23 UTC (rev 16271)
+++ cs/stats/trunk/get_all.cxx 2010-02-17 14:42:19 UTC (rev 16272)
@@ -14,6 +14,7 @@
{
map<string,set<string> > hits;
vector<string> remove_regex, months, years;
+ set<string> robots;
remove_regex.push_back("bot");
remove_regex.push_back("spider");
remove_regex.push_back("Yahoo! Slurp");
@@ -244,16 +245,16 @@
i!=boost::filesystem::directory_iterator(); ++i)
{
if(i->leaf().substr(0,10)=="access.log")
- logs.push_back(*i);
+ logs.push_back(*i);
}
/* Add all of the Zope logs */
for(fs::directory_iterator
- i("/var/lib/zope2.9/instance/cig/log/");
+ i("/var/lib/zope2.9/instance/cig/log/");
i!=boost::filesystem::directory_iterator(); ++i)
{
if(i->leaf().substr(0,2)=="Z2")
- logs.push_back(*i);
+ logs.push_back(*i);
}
for(list<fs::path>::iterator log_path=logs.begin(); log_path!=logs.end();
@@ -286,24 +287,33 @@
log.getline(line,max_size);
string logline(line);
bool valid(true);
- for(vector<string>::iterator i=remove_regex.begin();
- i!=remove_regex.end(); ++i)
- {
- if((*i)[0]=='^')
- {
- if(logline.size()>=i->size()-1
- && logline.substr(0,i->size()-1)==i->substr(1))
- {
- valid=false;
- break;
- }
- }
- else if(logline.find(*i)!=string::npos)
- {
- valid=false;
- break;
- }
- }
+ if(logline.find("robots.txt")!=string::npos)
+ {
+ string ip_address=logline.substr(0,logline.find(" "));
+ robots.insert(ip_address);
+ valid=false;
+ }
+ else
+ {
+ for(vector<string>::iterator i=remove_regex.begin();
+ i!=remove_regex.end(); ++i)
+ {
+ if((*i)[0]=='^')
+ {
+ if(logline.size()>=i->size()-1
+ && logline.substr(0,i->size()-1)==i->substr(1))
+ {
+ valid=false;
+ break;
+ }
+ }
+ else if(logline.find(*i)!=string::npos)
+ {
+ valid=false;
+ break;
+ }
+ }
+ }
if(valid)
for(vector<string>::iterator m=months.begin();
m!=months.end(); ++m)
@@ -324,6 +334,9 @@
fs::remove(temp_file);
}
+ for(set<string>::iterator i=robots.begin(); i!=robots.end(); ++i)
+ cout << "robot " << *i << "\n";
+
for(vector<string>::iterator m=months.begin();
m!=months.end(); ++m)
for(vector<string>::iterator y=years.begin();
@@ -337,6 +350,7 @@
if(i!=hits.end())
for(set<string>::iterator j=i->second.begin();
j!=i->second.end(); ++j)
- of << *j << "\n";
+ if(robots.find(*j)==robots.end())
+ of << *j << "\n";
}
}
More information about the CIG-COMMITS
mailing list