view src/pagerank/LinkConvertGraph.java @ 13:0ef7268bbbac draft

create descendiangOrder(Graph,FileOutputStream) method.
author one
date Sat, 08 Sep 2012 04:12:19 +0900
parents c7b139ff27e2
children 86567db31710
line wrap: on
line source

package pagerank;


import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.SAXException;

import com.tinkerpop.blueprints.Graph;
import com.tinkerpop.blueprints.Vertex;
import com.tinkerpop.blueprints.impls.tg.TinkerGraph;
import com.tinkerpop.blueprints.util.io.graphml.GraphMLWriter;

import xmlParser.TextTagParser;

/**
 * Parses an XML dump with {@link TextTagParser} into a map of
 * page title -> set of linked page titles, and converts (a bounded part of)
 * that map into a Blueprints {@link Graph} that is written out as GraphML.
 *
 * <p>The instance owns the input stream it opens; the stream is closed once
 * {@code parseXml()} has run (successfully or not).
 */
public class LinkConvertGraph {

	/** Encoding of the link log; fixed so the output does not depend on the platform default. */
	private static final String LOG_CHARSET = "UTF-8";

	private String filename;
	private FileInputStream fis;
	private SAXParserFactory factory;
	private SAXParser parser;
	private TextTagParser xmlParser;

	/** Page title -> titles of the pages it links to, as produced by the parser. */
	private HashMap<String,HashSet<String>> hash;


	/**
	 * Creates a converter with no input file; call {@link #setFilename(String)}
	 * before parsing.
	 *
	 * @throws ParserConfigurationException if the SAX parser cannot be configured
	 * @throws SAXException if the SAX parser cannot be created
	 */
	LinkConvertGraph() throws ParserConfigurationException, SAXException {
		xmlParser = new TextTagParser();
		factory = SAXParserFactory.newInstance();
		parser = factory.newSAXParser();
	}

	/**
	 * Creates a converter that reads from {@code filename}.
	 *
	 * @param filename path of the XML file to parse
	 * @throws FileNotFoundException if the file cannot be opened
	 * @throws ParserConfigurationException if the SAX parser cannot be configured
	 * @throws SAXException if the SAX parser cannot be created
	 */
	LinkConvertGraph(final String filename) throws FileNotFoundException, ParserConfigurationException, SAXException {
		// Build the parser first so a parser failure cannot leak an already opened file.
		this();
		this.filename = filename;
		this.fis = new FileInputStream(filename);
	}

	/**
	 * Switches the input to {@code filename}. A previously opened input stream
	 * is closed once the new file has been opened successfully.
	 *
	 * @param filename path of the XML file to parse
	 * @throws FileNotFoundException if the file cannot be opened; the previous
	 *         input (if any) is left untouched in that case
	 */
	public void setFilename(final String filename) throws FileNotFoundException {
		FileInputStream opened = new FileInputStream(filename);
		closeQuietly(this.fis);
		this.filename = filename;
		this.fis = opened;
	}

	/**
	 * Runs the SAX parser over the current input and stores the resulting
	 * link map. The input stream is always closed afterwards.
	 *
	 * @throws SAXException if the parser reports an error
	 * @throws IOException if reading the input fails
	 */
	private void parseXml() throws SAXException, IOException {
		try {
			parser.parse(this.fis, this.xmlParser);
			hash = xmlParser.getHash();
		} finally {
			closeQuietly(this.fis);
		}
	}

	/** Returns the link map built by the last {@code parseXml()} call (null before parsing). */
	private HashMap<String,HashSet<String>> getHash() {
		return hash;
	}

	/** Prints every title and its links to standard output. */
	public void printHash() {
		for (Map.Entry<String,HashSet<String>> entry : hash.entrySet()) {
			String title = entry.getKey();
			System.out.println("title: " + title);
			for (String link : entry.getValue()) {
				System.out.println("\t"+link);
			}
			System.out.println();
		}
	}

	/**
	 * Writes every title and its links to {@code os}, one record per entry.
	 *
	 * @param os destination stream; not closed by this method
	 * @throws IOException if writing fails
	 */
	private void printHash(FileOutputStream os) throws IOException {
		for (Map.Entry<String,HashSet<String>> entry : hash.entrySet()) {
			writeEntry(entry, os);
		}
	}

	/**
	 * Writes a single title and its links to {@code os} and flushes the stream.
	 *
	 * @param map entry holding the page title and the titles it links to
	 * @param os destination stream; not closed by this method
	 * @throws IOException if writing fails
	 */
	void printLinkLog(Map.Entry<String, HashSet<String>> map, FileOutputStream os) throws IOException {
		writeEntry(map, os);
	}

	/**
	 * Formats one record ("title: T", a tab-indented line per link, then a blank
	 * line), writes it in a single call and flushes.
	 */
	private static void writeEntry(Map.Entry<String, HashSet<String>> entry, FileOutputStream os) throws IOException {
		StringBuilder text = new StringBuilder();
		text.append("title: ").append(entry.getKey()).append("\n");
		for (String link : entry.getValue()) {
			text.append("\t").append(link).append("\n");
		}
		text.append("\n");
		os.write(text.toString().getBytes(LOG_CHARSET));
		os.flush();
	}

	/** Closes {@code in} if it is non-null; a failure to close a read-only input is ignored. */
	private static void closeQuietly(FileInputStream in) {
		if (in == null) {
			return;
		}
		try {
			in.close();
		} catch (IOException ignored) {
			// Nothing useful can be done about a failed close of an input stream.
		}
	}


	/**
	 * Parses the XML dump, logs the visited entries, builds a graph limited to
	 * {@code PAGENUM} newly created vertices and writes it as GraphML.
	 *
	 * @param args optional; {@code args[0]} overrides the default input file path
	 */
	public static void main(String[] args) {
		final String defaultFilename = "/Users/aotokage/testProgram/wiki/ja-pages_current.xml";
		final String filename = (args != null && args.length > 0) ? args[0] : defaultFilename;

		final long PAGENUM = 100;
		final String fileDB = "./resources/tinkerGraph"+ Long.toString(PAGENUM);
		final String logFile = "./resources/wikiLink"+Long.toString(PAGENUM)+".log";

		LinkConvertGraph lcg;

		try {
			lcg = new LinkConvertGraph(filename);
			lcg.parseXml();

			HashMap<String,HashSet<String>> hash = lcg.getHash();

			Graph graph = new TinkerGraph();
			LinkToVertex ltv = new LinkToVertex(graph);

			FileOutputStream fos = new FileOutputStream(logFile);
			try {
				// Counts vertices created so far; used to cap the graph size at PAGENUM.
				long countId = 0;
				for (Map.Entry<String, HashSet<String>> map : hash.entrySet()) {
					// NOTE(review): the entry is logged before the limit check, so the
					// log can contain one entry whose links never reach the graph.
					// Kept as in the original; confirm whether that is intended.
					lcg.printLinkLog(map, fos);
					if (PAGENUM <= countId) break;
					String pageTitle = map.getKey();
					Vertex v;
					// A null id is treated as "no vertex for this title yet".
					if ( ltv.getId(pageTitle) == null ) {
						v = ltv.createVertexWithPageTitle(pageTitle);
						ltv.setPageRank(v, (Double)0.0);
						countId++;
						if (PAGENUM <= countId) break;
					} else {
						v = ltv.getVertex(pageTitle);
					}

					for (String linkPageTitle : map.getValue()) {
						Vertex linkV;
						if ( ltv.getId(linkPageTitle) == null) {
							linkV = ltv.createVertexWithPageTitle(linkPageTitle);
							countId++;
							ltv.setPageRank(linkV, (Double)0.0);
						} else {
							linkV = ltv.getVertex(linkPageTitle);
						}
						ltv.setHasLink(v, linkV);
						// Leaves only the inner loop; the outer loop stops at its own check.
						if (PAGENUM <= countId) break;
					}

				}
			} finally {
				fos.close();
			}

			FileOutputStream out = new FileOutputStream(new File(fileDB));
			try {
				GraphMLWriter.outputGraph(graph, out);
			} finally {
				// Close explicitly so the GraphML file is released even if writing fails.
				out.close();
			}

		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (SAXException e) {
			e.printStackTrace();
		} catch (ParserConfigurationException e) {
			e.printStackTrace();
		} catch (IOException e) {
			System.err.println("Failed to parse xml");
			e.printStackTrace();
		}

	}
}