1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32 package org.osjava.norbert;
33
34 import java.io.IOException;
35 import java.io.StringReader;
36 import java.io.BufferedReader;
37 import java.io.InputStreamReader;
38 import java.io.InputStream;
39 import java.net.URL;
40 import java.net.URLDecoder;
41 import java.net.MalformedURLException;
42 import java.net.HttpURLConnection;
43 import java.net.URLConnection;
44
45 /***
46 * A Client which may be used to decide which urls on a website
47 * may be looked at, according to the norobots specification
48 * located at:
49 * http://www.robotstxt.org/wc/norobots-rfc.html
50 */
51 public class NoRobotClient {
52
53 private String userAgent;
54 private RulesEngine rules;
55 private RulesEngine wildcardRules;
56 private URL baseUrl;
57
58 /***
59 * Create a Client for a particular user-agent name.
60 *
61 * @param userAgent name for the robot
62 */
63 public NoRobotClient(String userAgent) {
64 this.userAgent = userAgent;
65 }
66
67 /***
68 * Head to a website and suck in their robots.txt file.
69 * Note that the URL passed in is for the website and does
70 * not include the robots.txt file itself.
71 *
72 * @param baseUrl of the site
73 */
74 public void parse(URL baseUrl) throws NoRobotException {
75
76 this.rules = new RulesEngine();
77
78 this.baseUrl = baseUrl;
79
80 URL txtUrl = null;
81 try {
82
83 txtUrl = new URL(baseUrl, "robots.txt");
84 } catch(MalformedURLException murle) {
85 throw new NoRobotException("Bad URL: "+baseUrl+", robots.txt. ", murle);
86 }
87
88 String txt = null;
89 try {
90 txt = loadContent(txtUrl, this.userAgent);
91 if(txt == null) {
92 throw new NoRobotException("No content found for: "+txtUrl);
93 }
94 } catch(IOException ioe) {
95 throw new NoRobotException("Unable to get content for: "+txtUrl, ioe);
96 }
97
98 try {
99 parseText(txt);
100 } catch(NoRobotException nre) {
101 throw new NoRobotException("Problem while parsing "+txtUrl, nre);
102 }
103 }
104
105 public void parseText(String txt) throws NoRobotException {
106 this.rules = parseTextForUserAgent(txt, this.userAgent);
107 this.wildcardRules = parseTextForUserAgent(txt, "*");
108 }
109
110 private RulesEngine parseTextForUserAgent(String txt, String userAgent) throws NoRobotException {
111
112 RulesEngine engine = new RulesEngine();
113
114
115
116
117
118 BufferedReader rdr = new BufferedReader( new StringReader(txt) );
119 String line = "";
120 String value = null;
121 boolean parsingAllowBlock = false;
122 try {
123 while( (line = rdr.readLine()) != null ) {
124
125 line = line.trim();
126
127
128 if(line.startsWith("#")) {
129 continue;
130 }
131
132
133
134
135 if(line.startsWith("User-agent:")) {
136
137 if(parsingAllowBlock) {
138
139 if(engine.isEmpty()) {
140
141
142 continue;
143 } else {
144 break;
145 }
146 }
147
148 value = line.substring("User-agent:".length()).trim();
149 if(value.equalsIgnoreCase(userAgent)) {
150 parsingAllowBlock = true;
151 continue;
152 }
153 } else {
154
155 if(parsingAllowBlock) {
156 if(line.startsWith("Allow:")) {
157 value = line.substring("Allow:".length()).trim();
158 value = URLDecoder.decode(value);
159 engine.allowPath( value );
160 } else
161 if(line.startsWith("Disallow:")) {
162 value = line.substring("Disallow:".length()).trim();
163 value = URLDecoder.decode(value);
164 engine.disallowPath( value );
165 } else {
166
167 continue;
168 }
169 } else {
170
171 continue;
172 }
173 }
174 }
175 } catch (IOException ioe) {
176
177 throw new NoRobotException("Problem while parsing text. ", ioe);
178 }
179
180 return engine;
181 }
182
183 /***
184 * Decide if the parsed website will allow this URL to be
185 * be seen.
186 *
187 * Note that parse(URL) must be called before this method
188 * is called.
189 *
190 * @param url in question
191 * @return is the url allowed?
192 *
193 * @throws IllegalStateException when parse has not been called
194 */
195 public boolean isUrlAllowed(URL url) throws IllegalStateException, IllegalArgumentException {
196 if(rules == null) {
197 throw new IllegalStateException("You must call parse before you call this method. ");
198 }
199
200 if( !baseUrl.getHost().equals(url.getHost()) ||
201 baseUrl.getPort() != url.getPort() ||
202 !baseUrl.getProtocol().equals(url.getProtocol()) )
203 {
204 throw new IllegalArgumentException("Illegal to use a different url, " + url.toExternalForm() +
205 ", for this robots.txt: "+this.baseUrl.toExternalForm());
206 }
207 String urlStr = url.toExternalForm().substring( this.baseUrl.toExternalForm().length() - 1);
208 if("/robots.txt".equals(urlStr)) {
209 return true;
210 }
211 urlStr = URLDecoder.decode( urlStr );
212 Boolean allowed = this.rules.isAllowed( urlStr );
213 if(allowed == null) {
214 allowed = this.wildcardRules.isAllowed( urlStr );
215 }
216 if(allowed == null) {
217 allowed = Boolean.TRUE;
218 }
219
220 return allowed.booleanValue();
221 }
222
223
224
225 private static String loadContent(URL url, String userAgent) throws IOException {
226 URLConnection urlConn = url.openConnection();
227 if(urlConn instanceof HttpURLConnection) {
228 if(userAgent != null) {
229 ((HttpURLConnection)urlConn).addRequestProperty("User-Agent", userAgent);
230 }
231 }
232 InputStream in = urlConn.getInputStream();
233 BufferedReader rdr = new BufferedReader(new InputStreamReader(in));
234 StringBuffer buffer = new StringBuffer();
235 String line = "";
236 while( (line = rdr.readLine()) != null) {
237 buffer.append(line);
238 buffer.append("\n");
239 }
240 in.close();
241 return buffer.toString();
242 }
243
244 }