View Javadoc

1   /*
2    * Copyright (c) 2003-2005, Henri Yandell
3    * All rights reserved.
4    * 
5    * Redistribution and use in source and binary forms, with or 
6    * without modification, are permitted provided that the 
7    * following conditions are met:
8    * 
9    * + Redistributions of source code must retain the above copyright notice, 
10   *   this list of conditions and the following disclaimer.
11   * 
12   * + Redistributions in binary form must reproduce the above copyright notice, 
13   *   this list of conditions and the following disclaimer in the documentation 
14   *   and/or other materials provided with the distribution.
15   * 
16   * + Neither the name of OSJava nor the names of its contributors 
17   *   may be used to endorse or promote products derived from this software 
18   *   without specific prior written permission.
19   * 
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
23   * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
30   * POSSIBILITY OF SUCH DAMAGE.
31   */
32  package org.osjava.norbert;
33  
34  import java.io.IOException;
35  import java.io.StringReader;
36  import java.io.BufferedReader;
37  import java.io.InputStreamReader;
38  import java.io.InputStream;
39  import java.net.URL;
40  import java.net.URLDecoder;
41  import java.net.MalformedURLException;
42  import java.net.HttpURLConnection;
43  import java.net.URLConnection;
44  
45  /***
46   * A Client which may be used to decide which urls on a website 
47   * may be looked at, according to the norobots specification 
48   * located at: 
49   * http://www.robotstxt.org/wc/norobots-rfc.html
50   */
51  public class NoRobotClient {
52  
53      private String userAgent;
54      private RulesEngine rules;
55      private RulesEngine wildcardRules;
56      private URL baseUrl;
57  
58      /***
59       * Create a Client for a particular user-agent name. 
60       *
61       * @param userAgent name for the robot
62       */
63      public NoRobotClient(String userAgent) {
64          this.userAgent = userAgent;
65      }
66  
67      /***
68       * Head to a website and suck in their robots.txt file. 
69       * Note that the URL passed in is for the website and does 
70       * not include the robots.txt file itself.
71       *
72       * @param baseUrl of the site
73       */
74      public void parse(URL baseUrl) throws NoRobotException {
75  
76          this.rules = new RulesEngine();
77  
78          this.baseUrl = baseUrl;
79  
80          URL txtUrl = null;
81          try {
82              // fetch baseUrl+"robots.txt"
83              txtUrl = new URL(baseUrl, "robots.txt");
84          } catch(MalformedURLException murle) {
85              throw new NoRobotException("Bad URL: "+baseUrl+", robots.txt. ", murle);
86          }
87  
88          String txt = null;
89          try {
90              txt = loadContent(txtUrl, this.userAgent);
91              if(txt == null) {
92                  throw new NoRobotException("No content found for: "+txtUrl);
93              }
94          } catch(IOException ioe) {
95              throw new NoRobotException("Unable to get content for: "+txtUrl, ioe);
96          }
97  
98          try {
99              parseText(txt);
100         } catch(NoRobotException nre) {
101             throw new NoRobotException("Problem while parsing "+txtUrl, nre);
102         }
103     }
104 
105     public void parseText(String txt) throws NoRobotException {
106         this.rules = parseTextForUserAgent(txt, this.userAgent);
107         this.wildcardRules = parseTextForUserAgent(txt, "*");
108     }
109 
110     private RulesEngine parseTextForUserAgent(String txt, String userAgent) throws NoRobotException {
111 
112         RulesEngine engine = new RulesEngine();
113 
114         // Classic basic parser style, read an element at a time, 
115         // changing a state variable [parsingAllowBlock]
116 
117         // take each line, one at a time
118         BufferedReader rdr = new BufferedReader( new StringReader(txt) );
119         String line = "";
120         String value = null;
121         boolean parsingAllowBlock = false;
122         try {
123             while( (line = rdr.readLine()) != null ) {
124                 // trim whitespace from either side
125                 line = line.trim();
126 
127                 // ignore startsWith('#')
128                 if(line.startsWith("#")) {
129                     continue;
130                 }
131 
132                 // if User-agent == userAgent 
133                 // record the rest up until end or next User-agent
134                 // then quit (? check spec)
135                 if(line.startsWith("User-agent:")) {
136 
137                     if(parsingAllowBlock) {
138                         // we've just finished reading allows/disallows
139                         if(engine.isEmpty()) {
140                             // multiple user agents in a line, let's 
141                             // wait til we get rules
142                             continue;
143                         } else {
144                             break;
145                         }
146                     }
147 
148                     value = line.substring("User-agent:".length()).trim();
149                     if(value.equalsIgnoreCase(userAgent)) {
150                         parsingAllowBlock = true;
151                         continue;
152                     }
153                 } else {
154                     // if not, then store if we're currently the user agent
155                     if(parsingAllowBlock) {
156                         if(line.startsWith("Allow:")) {
157                             value = line.substring("Allow:".length()).trim();
158                             value = URLDecoder.decode(value);
159                             engine.allowPath( value );
160                         } else 
161                         if(line.startsWith("Disallow:")) {
162                             value = line.substring("Disallow:".length()).trim();
163                             value = URLDecoder.decode(value);
164                             engine.disallowPath( value );
165                         } else {
166                             // ignore
167                             continue;
168                         }
169                     } else {
170                         // ignore
171                         continue;
172                     }
173                 }
174             }
175         } catch (IOException ioe) {
176             // As this is parsing a String, it should not have an IOE
177             throw new NoRobotException("Problem while parsing text. ", ioe);
178         }
179 
180         return engine;
181     }
182 
183     /***
184      * Decide if the parsed website will allow this URL to be 
185      * be seen. 
186      *
187      * Note that parse(URL) must be called before this method 
188      * is called. 
189      *
190      * @param url in question
191      * @return is the url allowed?
192      *
193      * @throws IllegalStateException when parse has not been called
194      */
195     public boolean isUrlAllowed(URL url) throws IllegalStateException, IllegalArgumentException {
196         if(rules == null) {
197             throw new IllegalStateException("You must call parse before you call this method.  ");
198         }
199 
200         if( !baseUrl.getHost().equals(url.getHost()) ||
201             baseUrl.getPort() != url.getPort() ||
202             !baseUrl.getProtocol().equals(url.getProtocol()) )
203         {
204             throw new IllegalArgumentException("Illegal to use a different url, " + url.toExternalForm() + 
205                                                ",  for this robots.txt: "+this.baseUrl.toExternalForm());
206         }
207         String urlStr = url.toExternalForm().substring( this.baseUrl.toExternalForm().length() - 1);
208         if("/robots.txt".equals(urlStr)) {
209             return true;
210         }
211         urlStr = URLDecoder.decode( urlStr );
212         Boolean allowed = this.rules.isAllowed( urlStr );
213         if(allowed == null) {
214             allowed = this.wildcardRules.isAllowed( urlStr );
215         }
216         if(allowed == null) {
217             allowed = Boolean.TRUE;
218         }
219 
220         return allowed.booleanValue();
221     }
222 
223     // INLINE: as such from genjava/gj-core's net package. Simple method 
224     // stolen from Payload too.
225     private static String loadContent(URL url, String userAgent) throws IOException {
226         URLConnection urlConn = url.openConnection();
227         if(urlConn instanceof HttpURLConnection) {
228             if(userAgent != null) {
229                 ((HttpURLConnection)urlConn).addRequestProperty("User-Agent", userAgent);
230             }
231         }
232         InputStream in = urlConn.getInputStream();
233         BufferedReader rdr = new BufferedReader(new InputStreamReader(in));
234         StringBuffer buffer = new StringBuffer();
235         String line = "";
236         while( (line = rdr.readLine()) != null) {
237             buffer.append(line);
238             buffer.append("\n");
239         }
240         in.close();
241         return buffer.toString();
242     }
243 
244 }