// Java browser: FetchURL.java
//
// Content of FetchURL.java extracted from spider.jar
/* 
 
class FetchURL retrieves content from a specified URL
 
used by Spider.java 
 
01-11-2006 version 0.1: initial version
 
*/
package spider;
 
import java.io.*;
import java.net.*;
import java.util.ArrayList;
 
class FetchURL extends Thread {
	final int max_concurrent = 10;  // maximum number of concurrent threads
	final boolean anytype = true; // download any type of content, not only html
 
	String urlName;
	int deepness;
	boolean anydomain;
	OutputStream f, content = null;
	HttpConnection server = null;
	String fileName=null;
 
	FetchURL(String s, int deepness, boolean anydomain) {
		urlName=s;
		this.deepness = deepness;
		this.anydomain = anydomain;
	}
 
 public synchronized void println(String s) {
	System.out.println(urlName+"--->"+s);
	if (Spider.out != null) 
		Spider.out.println("<td>"+s+"</td><td><a href=\""+fileName+"\">"+urlName+"</a></td></tr><tr><td>");
 }
 
 public synchronized void dump(String s) throws IOException {
	File tempFile = File.createTempFile("dump",null,new File(Spider.cache));
	PrintWriter outStream = new PrintWriter(new FileWriter(tempFile));
	outStream.println("Dump "+getName());
	outStream.println("Url "+urlName);
	outStream.println(s);
	outStream.close();
 }
 
// BEGIN TAG-0001
 int findUrl(int i, String ustring) {
	return minIndex(minIndex(minIndex(ustring.indexOf("<A", i),ustring.indexOf("<IMG", i)), minIndex(ustring.indexOf("<LINK", i),ustring.indexOf("<FRAME", i))),ustring.indexOf("<SCRIPT", i));
 }
 
 int minIndex(int i, int j) {
    if (i == -1)
	return (j);
    if (j == -1)
	return (i);
// now both i and j are >= 0
	if (i < j)
	return (i);
	else return (j);
 }
// END TAG-0001
 
 public void run() {
  if (Spider.TGroup == null)
  { Spider.TGroup = getThreadGroup();
  }
  boolean redirect = false;
  URI uri;File dir;
  int responseCode;
  try{
	  do {
		uri = new URI(urlName);
		if (!uri.getScheme().equals("http")) return;
		fileName = uri.getPath();
		if (fileName.length() == 0) fileName = "/";
		if (fileName.endsWith("/"))
		{ fileName += "index.htm";
		}
		dir =  new File(Spider.cache+"/"+uri.getHost());
		if (!dir.exists()) 	dir.mkdirs();
 
		if (uri.getHost()== null)
		{  dump("Assert: null host");
			return;
		}		
 
		server = new HttpConnection(urlName);
		responseCode = server.getResponseCode();
// check if redirect
		if (!redirect && (responseCode >= 300 && responseCode <= 399)) {
			urlName = server.getHeaderField("Location");
			server.close();
			redirect = true;
		} else redirect = false;
	   } while (redirect);
 
// BEGIN TAG-0001
//		fileName = dir.toString()+fileName.charAt(0)+fileName.substring(1).replace('/','.')+((uri.getRawQuery()!=null ? "^"+uri.getRawQuery().hashCode()+".html" : ""));
		fileName = dir.toString()+fileName.charAt(0)+fileName.substring(1)+((uri.getRawQuery()!=null ? "^"+uri.getRawQuery().replace('*','.').replace(':','.')+".html" : ""));
 
		//does destinations directory exist ?
		File destination = new File(fileName);
		if (destination.getParentFile() != null
            && !destination.getParentFile().exists()) {
            destination.getParentFile().mkdirs();
        }
// END TAG-0001
 
	   if (responseCode != 200) {
			println("HTTP response code: " + 
			   String.valueOf(responseCode)+" "+server.getResponseMessage());
		   PrintWriter outStream = new PrintWriter(new FileWriter(new File(fileName+".log")));
			int n = 0; String line;
		   while ((line = server.getHeaderField(n))!= null)
		   {
				outStream.println(line);
				n++;
		   }
			server.close();
			outStream.close();
			return;
	   }
 
	   if ((server.getContentType()!=null)&&(!server.getContentType().startsWith("text/html"))) {	
			if (anytype) {
				OutputStream content = new FileOutputStream(fileName);
				buffer abuf = new buffer();
				int r1, ns=abuf.size;
				DataInputStream serverIn = new DataInputStream(server.getInputStream());
				while ((r1 = abuf.readStream(serverIn)) > 0)
				{   ns+=r1;
					abuf.writeStream(content);
				}
				content.close();
				println(ns+" bytes downloaded");
				if (server.getLastModified() != 0)  new File(fileName).setLastModified(server.getLastModified());
			}
			else println ("Content type is different from text/html"); 
			server.close();
			return; 
		}
 
	   ArrayList<String> urls_to_fetch = new ArrayList<String>();
	   File target = new File(fileName);
       File tempFile = File.createTempFile("Spider",null,destination.getParentFile());
	   PrintWriter outStream = new PrintWriter(new FileWriter(tempFile));
	   String line, stemp="";
	   boolean ongoing = false;
	   String result = "downloaded";
	   Spider.stalled++;
	   try {
		  while ((line = server.readLine())!= null)
		  { 
			outStream.println(line);
			if (deepness > 0) {
				int i=0, j;
				search:
					while (ongoing || (i = findUrl(i, line.toUpperCase())) != -1)
					{   
						if ((j = line.indexOf(">", i))==-1) 
						{	stemp += line.substring(i);
							ongoing = true;
							break search;}
						stemp +=line.substring(i, j);
						int k;
	// extract the href value
	// NOTA: prima di SRC c'è un blank per gestire correttamente il parsing di un frammento del tipo: <img onMouseOver="javascript:this.src='/img/menu_2.gif'" src="/img/menu_2_off.gif">
						if ((k = minIndex(stemp.toUpperCase().indexOf("HREF"),stemp.toUpperCase().indexOf(" SRC"))) != -1)
							if ((k = stemp.indexOf("=",k)) != -1)
							{ 
								String url = stemp.substring(k+1).replace('\"',' ').trim();
								if ((k = url.indexOf(" ")) != -1) url = url.substring(0, k);
								try
								{   if ((k = url.indexOf("#")) != -1) url = url.substring(0, k);
									URL resolved = new URL(uri.toURL(),url);	
									String host = resolved.getHost();
									if (host != null) {
	// check we new link is in the same domain
										url = resolved.toString();
										if ((anydomain||host.endsWith(uri.getHost())) && Spider.visited_urls.add(url)) urls_to_fetch.add(url);
									}
								} catch (MalformedURLException ex) {
	// do nothing
								}
							}
						ongoing = false; stemp = ""; 
						i = j+1;					 
					}
			}
		  };
	   } catch (SocketException ex) {
	// do nothing
			result = ex.toString();
	   }
 
	   long webtime = server.getLastModified();
	   if (webtime == 0)
	   { webtime = System.currentTimeMillis();
	   }
	   server.close();
	   outStream.close();
	   target.delete(); 
	   tempFile.renameTo(target); 
	   target.setLastModified(webtime); 
	   println(result);
       for (int i = 0; i < urls_to_fetch.size(); i++)
		{   
			if (activeCount() > max_concurrent) yield();
			new FetchURL((String) urls_to_fetch.get(i), deepness-1, anydomain).start();
		}
	   Spider.stalled--;
 
 
//	   if (Spider.out != null) Spider.out.flush();
  }catch (URISyntaxException ex){
	   println("Bad URL");
  }catch (MalformedURLException ex){
	   println("Bad URL");
  }catch (UnknownHostException ex){
	   println("Unknown host");
  }catch (ConnectException ex){
	   println("Unable to connect");
  }catch (SocketException ex){
	   println(ex.toString());
  }catch (Exception ex){
	   println(ex.toString());ex.printStackTrace(System.err);
  }
 }
}