Content of Spider.java extracted from spider.jar
/* The program Spider.java fetches a web page, then parse the page to get hyperlinks to other pages, again and again... version 0.1: initial version version 0.1.5: added download of not html files version 0.2 changed the name files are named and corrected some errors, see tag TAG-0001 version 0.3 adapted to JDK 5 version 0.4: package introduced miglioramenti da introdurre: - non aprire sempre una nuova connessione TCP/IP per lo stesso dominio: riusare le connessioni - introdurre GUI basata su finestre problemi con i link piĆ¹ lunghi di 255 caratteri: vengono generati dei file con nomi del tipo Spider01234.tmp requires HttpConnection.java and Parser.java Usage: java spider.Spider [-deepness <levels>] [-anydomain] [-log] [-help] <url> -deepness <levels> specify the number of levels -anydomain enable search in any domain -log create the log file Spider.html -help print this help message */ package spider; import parser.*; import java.io.*; import java.util.Set; import java.util.HashSet; import java.util.Collections; public class Spider { static final String cache = "cache"; // directory used to contain files static ThreadGroup TGroup = null; static Set<String> visited_urls = Collections.synchronizedSet(new HashSet<String>()); static PrintWriter out = null; static int stalled = 0; static int _deepness = 1; static void error(String s) { System.err.println(s); System.exit(0); } public static void main(String args[]) throws IOException, ParserException { Parser cli = new Parser(); Option deepness = cli.addOption("-deepness", true, "<levels>","specify the number of levels"); Option anydomain = cli.addOption("-anydomain", false, null,"enable search in any domain"); Option log = cli.addOption("-log", false, null,"create the log file Spider.html"); cli.addOption("-help", false, null,"print this help message"); String [] result = cli.parse(args); if(result.length == 1) { try { if (cli.hasOption(deepness)) _deepness = Integer.parseInt(cli.getValue(deepness)); } catch (NumberFormatException ex) { error("error: deepness must be a positive integer value"); } if (_deepness < 0) error("error: deepness must be a positive integer value"); File dir = new File(cache); if (!dir.exists()) dir.mkdir(); Runtime.getRuntime().addShutdownHook(new shutdown()); if (cli.hasOption(log)) { out = new PrintWriter(new FileWriter("Spider.html")); out.println("<html><header><title>Spider("+result[0]+")</title></header><body><table><tr><td>"); } new FetchURL(result[0], _deepness, cli.hasOption(anydomain)).start(); } else error("Usage: java Spider"+cli.getUsage("<url>")); } } class shutdown extends Thread { public void run() { System.out.println("====================="); System.out.println("Number of stalled threads="+Spider.stalled); System.out.println("====================="); System.out.println("List of visited urls:"); Object [] urls = Spider.visited_urls.toArray(); for (int i = 0; i < urls.length; i++) System.out.println((String)urls[i]); // Spider.TGroup.list(); if (Spider.out != null) { Spider.out.println("</td></tr></body></html>"); Spider.out.flush(); Spider.out.close(); } } }