[cig-commits] r16272 - cs/stats/trunk

sue at geodynamics.org sue at geodynamics.org
Wed Feb 17 06:42:20 PST 2010


Author: sue
Date: 2010-02-17 06:42:19 -0800 (Wed, 17 Feb 2010)
New Revision: 16272

Modified:
   cs/stats/trunk/get_all.cxx
Log:
Reliably remove all robots by not counting any IPs that look at robots.txt

Modified: cs/stats/trunk/get_all.cxx
===================================================================
--- cs/stats/trunk/get_all.cxx	2010-02-17 01:46:23 UTC (rev 16271)
+++ cs/stats/trunk/get_all.cxx	2010-02-17 14:42:19 UTC (rev 16272)
@@ -14,6 +14,7 @@
 {
   map<string,set<string> > hits;
   vector<string> remove_regex, months, years;
+  set<string> robots;
   remove_regex.push_back("bot");
   remove_regex.push_back("spider");
   remove_regex.push_back("Yahoo! Slurp");
@@ -244,16 +245,16 @@
       i!=boost::filesystem::directory_iterator(); ++i)
     {
       if(i->leaf().substr(0,10)=="access.log")
-	logs.push_back(*i);
+        logs.push_back(*i);
     }
 
   /* Add all of the Zope logs */
   for(fs::directory_iterator
-	i("/var/lib/zope2.9/instance/cig/log/");
+        i("/var/lib/zope2.9/instance/cig/log/");
       i!=boost::filesystem::directory_iterator(); ++i)
     {
       if(i->leaf().substr(0,2)=="Z2")
-	logs.push_back(*i);
+        logs.push_back(*i);
     }
 
   for(list<fs::path>::iterator log_path=logs.begin(); log_path!=logs.end();
@@ -286,24 +287,33 @@
 	  log.getline(line,max_size);
 	  string logline(line);
 	  bool valid(true);
-	  for(vector<string>::iterator i=remove_regex.begin();
-	      i!=remove_regex.end(); ++i)
-	    {
-	      if((*i)[0]=='^')
-		{
-		  if(logline.size()>=i->size()-1
-		     && logline.substr(0,i->size()-1)==i->substr(1))
-		    {
-		      valid=false;
-		      break;
-		    }
-		}
-	      else if(logline.find(*i)!=string::npos)
-		{
-		  valid=false;
-		  break;
-		}
-	    }
+          if(logline.find("robots.txt")!=string::npos)
+            {
+              string ip_address=logline.substr(0,logline.find(" "));
+              robots.insert(ip_address);
+              valid=false;
+            }
+          else
+            {
+              for(vector<string>::iterator i=remove_regex.begin();
+                  i!=remove_regex.end(); ++i)
+                {
+                  if((*i)[0]=='^')
+                    {
+                      if(logline.size()>=i->size()-1
+                         && logline.substr(0,i->size()-1)==i->substr(1))
+                        {
+                          valid=false;
+                          break;
+                        }
+                    }
+                  else if(logline.find(*i)!=string::npos)
+                    {
+                      valid=false;
+                      break;
+                    }
+                }
+            }
 	  if(valid)
 	    for(vector<string>::iterator m=months.begin();
 		m!=months.end(); ++m)
@@ -324,6 +334,9 @@
       fs::remove(temp_file);
     }
 
+  for(set<string>::iterator i=robots.begin(); i!=robots.end(); ++i)
+    cout << "robot " << *i << "\n";
+
   for(vector<string>::iterator m=months.begin();
       m!=months.end(); ++m)
     for(vector<string>::iterator y=years.begin();
@@ -337,6 +350,7 @@
 	  if(i!=hits.end())
 	    for(set<string>::iterator j=i->second.begin();
 		j!=i->second.end(); ++j)
-	      of << *j << "\n";
+              if(robots.find(*j)==robots.end())
+                of << *j << "\n";
 	}
 }



More information about the CIG-COMMITS mailing list