/**
 * TreeParser.java
 *
 * @author	Michael Privat
 * @version 1.2 - March 1998
 */

import java.net.*;
import java.io.*;
import java.util.Date;
import java.text.DateFormat;

public class TreeParser extends Thread {
	final int OUT_DOCUMENT		= -1;	// Document not in explored web site
	final int NO_HTML 			= -2; // Document is not HTML file

	// Standards HTML codes
	final int HTTP_ERROR			= 777;
	final int HTTP_OK				= 200;
	final int HTTP_REDIRECTED	= 302;
	final int HTTP_NO_DOCUMENT	= 404;

	// Information on current explored file
	String Title = "";
	String date = "";
	int ct_len  	= 0;

	URL targetUrl;
	String webSite;
	Explorer explorer;
	WebTree wt;
	int PORT;
	boolean parent;
	String startDir;

	public TreeParser(Explorer r, URL name, boolean p) {
		targetUrl = name;
		webSite = targetUrl.getHost();
		explorer = r;
		parent = p;
		startDir = getDir(targetUrl.getFile());
	}

	/**
	 * Starts parsing
	 */
	public void run() {
		PORT = 80;
		wt = new WebTree(new Info("root", Info.DIR, "WebSite name", "", 0));
		parse(targetUrl);
		System.out.println("Parsing finished");
		explorer.endParsing();
		//wt.display();
	}

	private void parse(URL url) {
		HTMLParser html = null;
		Zone zone = new Zone(false, false, false, false, false);
		String curDir = getDir(url.getFile());	
		try {
			html = new HTMLParser(url);
			byte c;
			boolean src = false;
			String reparse;
			while( (c = (byte)html.read()) != -1 ) {
				if(c=='<') zone = html.getTag(zone);
				else if(c=='>') zone.reset();
				else {
					if((zone.hrefZone() || zone.scriptZone())&& (c=='h' || c=='H')) {
						html.unread(c);
						if( !(src=html.parseSrc("href")) )
							c = (byte)html.read();
					}
					else if(zone.backgroundZone() && (c=='b' || c=='B')) {
						html.unread(c);
						if( !(src=html.parseSrc("background")) )
							c = (byte)html.read();
					}
					else if((zone.srcZone() || zone.scriptZone()) && (c=='s' || c=='S')) {
						html.unread(c);
						if( !(src=html.parseSrc("src")) )
							c = (byte)html.read();
					}
					else if((zone.appletZone()) && (c=='c' || c=='C')) {
						html.unread(c);
						if( !(src=html.parseSrc("code")) )
							c = (byte)html.read();
					}
					else if((zone.appletZone()) && (c=='a' || c=='A')) {
						html.unread(c);
						if( !(src=html.parseSrc("archive")) )
							c = (byte)html.read();
					}
				}

				if(src) {
					src = false;
					reparse = html.getReference();
				}
				else reparse = "";

				if(!reparse.equals("")) {
				// Sticks protocol, web site and reparser
				URL urlFound = null;
				if(reparse.startsWith("/"))
					urlFound = new URL("http", webSite, reparse);
				else if(reparse.indexOf("://") != -1)
					urlFound = new URL(reparse);
				else
					urlFound = new URL("http", webSite, curDir + '/' + reparse);

				// Check out connection validity
				try {
					if(getDir(urlFound.getFile()).startsWith(startDir) || parent) {
						int value = validate(urlFound);
						if(value != HTTP_ERROR) {
							// Recursive call HERE
							String abs = checkPATH(urlFound.getFile());
							if(wt.add(new Info(abs, value, Title, date, ct_len))) {
								URL urlReady = new URL(urlFound.getProtocol(),
																urlFound.getHost(),
																abs);
								//System.out.println("###"+urlReady);
								explorer.notify(new Info(abs, value, Title, date, ct_len));
								// Recursive only with HTML files
								if(value == Info.HTML || value == Info.DIR)
									parse(urlReady);
							}
						}
					}
				}
				catch(IOException ignore) {}
				}
			}
		}
		catch(IOException e) {
			//System.err.println("IO Error: " + e.getMessage());
		}
	}

	private String getDir(String f) {
		// Only HTML files need to be detected for curent dir
		// since other files won't be parsed
		if( !f.endsWith(".htm")		&&
			 !f.endsWith(".html") ) {
				if(f.endsWith("/")) return f.substring(0, f.length()-1);
				else return f;
		}

		int idx = f.lastIndexOf('/');

		return f.substring(0, idx);
	}

	/**
	 * Validates an URL, returns HTTP_ERROR if document can't be found 
	 * else returns the corresponding Info constant code to the doc type
	 * It also updates the Title static variable if url correspond to
	 * a HTML file
	 * @see	Info.java
	 */
	private int validate(URL url) throws IOException {
		long dt = 0;
		try {
			// External links forbiden (For the moment)
			if(!webSite.equals(url.getHost())) return HTTP_ERROR;
		}
		catch(Exception e) {
			return HTTP_ERROR;
		}

		String content = "";
		int len = 0;

		try {
			URLConnection cnx = url.openConnection();
			content = cnx.getContentType();
			len = cnx.getContentLength();
			dt = cnx.getDate();
			date = new String(DateFormat.getDateInstance().format(new Date(dt)));
			if(cnx.getHeaderField(0).startsWith("HTTP/1.0 404")) return HTTP_ERROR;
			ct_len = len;
		}
		catch(Exception e) {
			return HTTP_ERROR;
		}
 
		int contentType = -1;
		try {
		if(content.indexOf("text/html") != -1) contentType = Info.HTML;
		if(content.indexOf("image/gif") != -1) contentType = Info.GIF;
		if(content.indexOf("image/jpeg") != -1) contentType = Info.JPEG;
		if(content.indexOf("application/pdf") != -1) contentType = Info.PDF;
		if(content.indexOf("application/x-pdf") != -1) contentType = Info.PDF;
		if(content.indexOf("audio/x-mpeg") != -1) contentType = Info.MPEG;
		if(content.indexOf("text/plain") != -1) contentType = Info.TXT;
		if(content.indexOf("application/postscript") != -1) contentType = Info.PS;
		if(content.indexOf("application/x-gtar") != -1) contentType = Info.GTAR;
		if(content.indexOf("application/x-tar") != -1) contentType = Info.TAR;
		if(content.indexOf("application/x-zip-compressed") != -1) contentType = Info.ZIP;
		if(content.indexOf("application/x-mpeg2") != -1) contentType = Info.MPEG2;
		if(content.indexOf("application/x-wav") != -1) contentType = Info.WAV;
		if(content.indexOf("application/x-bmp") != -1) contentType = Info.BMP;
		if(content.indexOf("application/x-tiff") != -1) contentType = Info.TIFF;
		if(content.indexOf("application/msword") != -1) contentType = Info.DOC;
		if(content.indexOf("application/x-java-vm") != -1) contentType = Info.CLASS;
		}
		catch(Exception ignore) {}

		if(contentType == -1) contentType = Info.UNKW;

		if(len == 0) contentType = Info.DIR;
		if(ct_len <= 0) ct_len = 0;

		switch(contentType) {
			case Info.HTML :
      		try {
         		InputStream ins = url.openStream();
         		BufferedReader in = new BufferedReader(new InputStreamReader(ins));

					Title = in.readLine();
					int val=0;
					if(Title!=null)
					while((val=Title.toLowerCase().indexOf("<title>"))==-1) {
						Title = in.readLine();
						if(Title==null) break;
					}

					if(Title == null) Title = "Unknown title";
					else {
						Title=Title.substring(7+val);
						int ttl = Title.toLowerCase().indexOf("</title>");
						if(ttl!=-1) Title=Title.substring(0, Title.length()-8);
						Title=VireTheAccents(Title);
					}
					in.close();
					ins.close();
				}
				catch(Exception e) {
					return HTTP_ERROR;
				}
				break;
			case Info.GIF :
				Title = "GIF image";
				break;
			case Info.TXT :
				Title = "Plain text";
				break;
			case Info.PDF :
				Title = "PDF document";
				break;
			case Info.MPEG :
				Title = "MPEG multimedia";
				break;
			case Info.PS :
				Title = "Postscript document";
				break;
			case Info.GTAR :
				Title = "GTAR archive";
				break;
			case Info.TAR :
				Title = "TAR archive";
				break;
			case Info.ZIP :
				Title = "ZIP archive";
				break;
			case Info.MPEG2 :
				Title = "MPEG2 multimedia";
				break;
			case Info.WAV :
				Title = "WAV audio";
				break;
			case Info.BMP :
				Title = "BMP image";
				break;
			case Info.TIFF :
				Title = "TIFF image";
				break;
			case Info.JPEG :
				Title = "JPEG image";
				break;
			case Info.DOC :
				Title = "MS-Word Document";
				break;
			case Info.CLASS :
				Title = "Java byte code";
				break;
			case Info.UNKW :
				Title = "Unregistered type";
				break;
		}

		return contentType;
	}

	/**
	 * Check PATH path to eradicate ./ and ../
	 */
	private String checkPATH(String path) {
		String result = new String(path);

		while(result.startsWith("./") ) result = result.substring(2);
		while(result.startsWith("../") ) result = result.substring(3);

		int idx;
		while( (idx = result.indexOf("/./")) != -1) {
			result = result.substring(0, idx+1) + result.substring(idx+3);
		}

		while( (idx = result.lastIndexOf("/../")) != -1) {
			result = result.substring(0, result.lastIndexOf("/", idx-1) +1) +
						result.substring(idx+4);
		}

		return result;
	}

	private String VireTheAccents(String Title) {
		String tmp = Title;

		int idx = tmp.indexOf("acute;");
		if(idx > 1)
		while(tmp.charAt(idx-2)=='&') {
			tmp = tmp.substring(0, idx-2) + acute(tmp.charAt(idx-1))
					+ tmp.substring(idx+6);
			idx = tmp.indexOf("acute;");
			if(idx<2) break;
		}

		idx = tmp.indexOf("circ;");
		if(idx > 1)
		while(tmp.charAt(idx-2)=='&') {
			tmp = tmp.substring(0, idx-2) + circ(tmp.charAt(idx-1))
					+ tmp.substring(idx+5);
			idx = tmp.indexOf("circ;");
			if(idx<2) break;
		}

		idx = tmp.indexOf("grave;");
		if(idx > 1)
		while(tmp.charAt(idx-2)=='&') {
			tmp = tmp.substring(0, idx-2) + grave(tmp.charAt(idx-1))
					+ tmp.substring(idx+6);
			idx = tmp.indexOf("grave;");
			if(idx<2) break;
		}

		return tmp;
	}

	private char acute(char c) {
		switch(c) {
			case 'e' : return 'é';
			case 'a' : return 'á';
		}
		return c;
	}

	private char grave(char c) {
		switch(c) {
			case 'e' : return 'è';
			case 'a' : return 'à';
		}
		return c;
	}

	private char circ(char c) {
		switch(c) {
			case 'e' : return 'ê';
			case 'a' : return 'â';
		}
		return c;
	}
}

