Java browser: Spider.java

Content of Spider.java extracted from spider.jar

/* 
The program Spider.java fetches a web page, then parses the page to get hyperlinks to other pages, again and again...
 
version 0.1: initial version
version 0.1.5: added download of not html files
version 0.2: changed the way files are named and corrected some errors, see tag TAG-0001
version 0.3: adapted to JDK 5
version 0.4: package introduced
 
 
improvements to introduce: 
- do not always open a new TCP/IP connection for the same domain: reuse connections
- introduce a window-based GUI
problems with links longer than 255 characters: files with names like Spider01234.tmp are generated
 
 
requires HttpConnection.java and Parser.java
 
Usage: java spider.Spider [-deepness <levels>] [-anydomain] [-log] [-help] <url>
 
-deepness <levels>
            specify the number of levels
-anydomain  enable search in any domain
-log        create the log file Spider.html
-help       print this help message
*/
package spider;
 
import parser.*;
import java.io.*;
import java.util.Set;
import java.util.HashSet;
import java.util.Collections;
 
 
public class Spider {
	static final String cache = "cache";  // directory used to contain files
 
	static ThreadGroup TGroup = null;
	static Set<String> visited_urls = Collections.synchronizedSet(new HashSet<String>());
	static PrintWriter out = null;
	static int stalled = 0;
	static int _deepness = 1;
 
 static void error(String s) {
	 System.err.println(s);
	 System.exit(0);
 }
 
 public static void main(String args[]) throws IOException, ParserException {  
	Parser cli = new Parser();
	Option deepness = cli.addOption("-deepness", true, "<levels>","specify the number of levels");
	Option anydomain = cli.addOption("-anydomain", false, null,"enable search in any domain");
	Option log = cli.addOption("-log", false, null,"create the log file Spider.html");
	cli.addOption("-help", false, null,"print this help message");
	String [] result = cli.parse(args);
	if(result.length == 1) {
		try
		{
		  if (cli.hasOption(deepness)) _deepness = Integer.parseInt(cli.getValue(deepness));	
		}
		catch (NumberFormatException ex)
		{  error("error: deepness must be a positive integer value");	}
 
		if (_deepness < 0) error("error: deepness must be a positive integer value");
		File dir =  new File(cache);
		if (!dir.exists()) 	dir.mkdir();
	    Runtime.getRuntime().addShutdownHook(new shutdown());
		if (cli.hasOption(log)) {
			out = new PrintWriter(new FileWriter("Spider.html"));
			out.println("<html><header><title>Spider("+result[0]+")</title></header><body><table><tr><td>");
		}
		new FetchURL(result[0], _deepness, cli.hasOption(anydomain)).start();
  } else error("Usage: java Spider"+cli.getUsage("<url>"));
 }
}
 
 
 
 class shutdown extends Thread {
 
	 public void run() {
		System.out.println("=====================");
		System.out.println("Number of stalled threads="+Spider.stalled);
		System.out.println("=====================");
		System.out.println("List of visited urls:");
		Object [] urls = Spider.visited_urls.toArray();
        for (int i = 0; i < urls.length; i++)
            System.out.println((String)urls[i]);
//		Spider.TGroup.list();
		if (Spider.out != null)
		{	Spider.out.println("</td></tr></body></html>");
			Spider.out.flush();
			Spider.out.close();
		}
	 }
 }
Share Share on Facebook Share on Twitter Bookmark on Reddit Share via mail
Privacy Policy Creative Commons Attribution-Share Alike Trovami