Content of FetchURL.java extracted from spider.jar
/* class FetchURL retrieves content from a specified URL used by Spider.java 01-11-2006 version 0.1: initial version */ package spider; import java.io.*; import java.net.*; import java.util.ArrayList; class FetchURL extends Thread { final int max_concurrent = 10; // maximum number of concurrent threads final boolean anytype = true; // download any type of content, not only html String urlName; int deepness; boolean anydomain; OutputStream f, content = null; HttpConnection server = null; String fileName=null; FetchURL(String s, int deepness, boolean anydomain) { urlName=s; this.deepness = deepness; this.anydomain = anydomain; } public synchronized void println(String s) { System.out.println(urlName+"--->"+s); if (Spider.out != null) Spider.out.println("<td>"+s+"</td><td><a href=\""+fileName+"\">"+urlName+"</a></td></tr><tr><td>"); } public synchronized void dump(String s) throws IOException { File tempFile = File.createTempFile("dump",null,new File(Spider.cache)); PrintWriter outStream = new PrintWriter(new FileWriter(tempFile)); outStream.println("Dump "+getName()); outStream.println("Url "+urlName); outStream.println(s); outStream.close(); } // BEGIN TAG-0001 int findUrl(int i, String ustring) { return minIndex(minIndex(minIndex(ustring.indexOf("<A", i),ustring.indexOf("<IMG", i)), minIndex(ustring.indexOf("<LINK", i),ustring.indexOf("<FRAME", i))),ustring.indexOf("<SCRIPT", i)); } int minIndex(int i, int j) { if (i == -1) return (j); if (j == -1) return (i); // now both i and j are >= 0 if (i < j) return (i); else return (j); } // END TAG-0001 public void run() { if (Spider.TGroup == null) { Spider.TGroup = getThreadGroup(); } boolean redirect = false; URI uri;File dir; int responseCode; try{ do { uri = new URI(urlName); if (!uri.getScheme().equals("http")) return; fileName = uri.getPath(); if (fileName.length() == 0) fileName = "/"; if (fileName.endsWith("/")) { fileName += "index.htm"; } dir = new File(Spider.cache+"/"+uri.getHost()); if (!dir.exists()) dir.mkdirs(); if (uri.getHost()== null) { dump("Assert: null host"); return; } server = new HttpConnection(urlName); responseCode = server.getResponseCode(); // check if redirect if (!redirect && (responseCode >= 300 && responseCode <= 399)) { urlName = server.getHeaderField("Location"); server.close(); redirect = true; } else redirect = false; } while (redirect); // BEGIN TAG-0001 // fileName = dir.toString()+fileName.charAt(0)+fileName.substring(1).replace('/','.')+((uri.getRawQuery()!=null ? "^"+uri.getRawQuery().hashCode()+".html" : "")); fileName = dir.toString()+fileName.charAt(0)+fileName.substring(1)+((uri.getRawQuery()!=null ? "^"+uri.getRawQuery().replace('*','.').replace(':','.')+".html" : "")); //does destinations directory exist ? File destination = new File(fileName); if (destination.getParentFile() != null && !destination.getParentFile().exists()) { destination.getParentFile().mkdirs(); } // END TAG-0001 if (responseCode != 200) { println("HTTP response code: " + String.valueOf(responseCode)+" "+server.getResponseMessage()); PrintWriter outStream = new PrintWriter(new FileWriter(new File(fileName+".log"))); int n = 0; String line; while ((line = server.getHeaderField(n))!= null) { outStream.println(line); n++; } server.close(); outStream.close(); return; } if ((server.getContentType()!=null)&&(!server.getContentType().startsWith("text/html"))) { if (anytype) { OutputStream content = new FileOutputStream(fileName); buffer abuf = new buffer(); int r1, ns=abuf.size; DataInputStream serverIn = new DataInputStream(server.getInputStream()); while ((r1 = abuf.readStream(serverIn)) > 0) { ns+=r1; abuf.writeStream(content); } content.close(); println(ns+" bytes downloaded"); if (server.getLastModified() != 0) new File(fileName).setLastModified(server.getLastModified()); } else println ("Content type is different from text/html"); server.close(); return; } ArrayList<String> urls_to_fetch = new ArrayList<String>(); File target = new File(fileName); File tempFile = File.createTempFile("Spider",null,destination.getParentFile()); PrintWriter outStream = new PrintWriter(new FileWriter(tempFile)); String line, stemp=""; boolean ongoing = false; String result = "downloaded"; Spider.stalled++; try { while ((line = server.readLine())!= null) { outStream.println(line); if (deepness > 0) { int i=0, j; search: while (ongoing || (i = findUrl(i, line.toUpperCase())) != -1) { if ((j = line.indexOf(">", i))==-1) { stemp += line.substring(i); ongoing = true; break search;} stemp +=line.substring(i, j); int k; // extract the href value // NOTA: prima di SRC c'รจ un blank per gestire correttamente il parsing di un frammento del tipo: <img onMouseOver="javascript:this.src='/img/menu_2.gif'" src="/img/menu_2_off.gif"> if ((k = minIndex(stemp.toUpperCase().indexOf("HREF"),stemp.toUpperCase().indexOf(" SRC"))) != -1) if ((k = stemp.indexOf("=",k)) != -1) { String url = stemp.substring(k+1).replace('\"',' ').trim(); if ((k = url.indexOf(" ")) != -1) url = url.substring(0, k); try { if ((k = url.indexOf("#")) != -1) url = url.substring(0, k); URL resolved = new URL(uri.toURL(),url); String host = resolved.getHost(); if (host != null) { // check we new link is in the same domain url = resolved.toString(); if ((anydomain||host.endsWith(uri.getHost())) && Spider.visited_urls.add(url)) urls_to_fetch.add(url); } } catch (MalformedURLException ex) { // do nothing } } ongoing = false; stemp = ""; i = j+1; } } }; } catch (SocketException ex) { // do nothing result = ex.toString(); } long webtime = server.getLastModified(); if (webtime == 0) { webtime = System.currentTimeMillis(); } server.close(); outStream.close(); target.delete(); tempFile.renameTo(target); target.setLastModified(webtime); println(result); for (int i = 0; i < urls_to_fetch.size(); i++) { if (activeCount() > max_concurrent) yield(); new FetchURL((String) urls_to_fetch.get(i), deepness-1, anydomain).start(); } Spider.stalled--; // if (Spider.out != null) Spider.out.flush(); }catch (URISyntaxException ex){ println("Bad URL"); }catch (MalformedURLException ex){ println("Bad URL"); }catch (UnknownHostException ex){ println("Unknown host"); }catch (ConnectException ex){ println("Unable to connect"); }catch (SocketException ex){ println(ex.toString()); }catch (Exception ex){ println(ex.toString());ex.printStackTrace(System.err); } } }