view src/wikigraph/LinkConvertGraph.java @ 20:2c3a10047ec6 draft

add ReadWikiLink.java and LinkToNode.java
author one
date Mon, 27 Aug 2012 04:30:53 +0900
parents e01c8a8190cc
children f9ef906676eb
line wrap: on
line source

package wikigraph;

import howtouse.TextTagParser;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Transaction;
import org.neo4j.kernel.EmbeddedGraphDatabase;
import org.xml.sax.SAXException;

/**
 * Parses an XML file with {@link TextTagParser}, keeps the resulting
 * title-to-links map, dumps that map to a log file and registers every
 * title and link target in an embedded Neo4j database through
 * {@link LinkToNode}.
 */
public class LinkConvertGraph {

	/** Input file used by {@link #main(String[])} when no path is passed on the command line. */
	private static final String DEFAULT_DUMP_PATH = "/Users/aotokage/testProgram/wiki/ja-pages_current.xml";

	/** File that receives the textual dump of the parsed map. */
	private static final String LINK_LOG_PATH = "./resource/wikilink.log";

	/** Store directory handed to the embedded Neo4j database. */
	private static final String GRAPH_DB_PATH = "wikiLinkDB_";

	/** Encoding of the log file; explicit so the output does not depend on the platform default. */
	private static final String LOG_CHARSET = "UTF-8";

	private String filename;
	private FileInputStream fis;
	private SAXParserFactory factory;
	private SAXParser parser;
	private TextTagParser xmlParser;

	/** Result of the last successful {@link #parseXml()}; {@code null} until then. */
	private HashMap<String,HashSet<String>> hash;


	/**
	 * Creates a converter without an input file; one has to be supplied
	 * through {@link #setFilename(String)} before parsing.
	 *
	 * @throws ParserConfigurationException if no SAX parser can be configured
	 * @throws SAXException if the SAX parser cannot be created
	 */
	LinkConvertGraph() throws ParserConfigurationException, SAXException {
		xmlParser = new TextTagParser();
		factory = SAXParserFactory.newInstance();
		parser = factory.newSAXParser();
	}

	/**
	 * Creates a converter and opens {@code filename} for reading.
	 *
	 * @param filename path of the XML file to parse
	 * @throws FileNotFoundException if the file cannot be opened
	 * @throws ParserConfigurationException if no SAX parser can be configured
	 * @throws SAXException if the SAX parser cannot be created
	 */
	LinkConvertGraph(final String filename) throws FileNotFoundException, ParserConfigurationException, SAXException {
		// Build the parser first so a parser failure cannot leak an open stream.
		this();
		setFilename(filename);
	}

	/**
	 * Switches the input to {@code filename}, closing any stream that was
	 * opened earlier. The previous input is kept if the new file cannot be
	 * opened.
	 *
	 * @param filename path of the XML file to parse
	 * @throws FileNotFoundException if the file cannot be opened
	 */
	private void setFilename(final String filename) throws FileNotFoundException {
		FileInputStream opened = new FileInputStream(filename);
		closeInput();
		this.filename = filename;
		this.fis = opened;
	}

	/**
	 * Runs the SAX parser over the current input, stores the map collected by
	 * the {@link TextTagParser} handler and always closes the input stream.
	 *
	 * @throws IllegalStateException if no input file has been opened
	 * @throws SAXException if the parser reports an error
	 * @throws IOException if reading the input fails
	 */
	private void parseXml() throws SAXException, IOException {
		if (this.fis == null) {
			throw new IllegalStateException("no input file has been opened");
		}
		try {
			parser.parse(this.fis, this.xmlParser);
			hash = xmlParser.getHash();
		} finally {
			// The stream is consumed (or broken) either way; release the file handle.
			closeInput();
		}
	}

	/** Closes the current input stream, if any, and forgets it. */
	private void closeInput() {
		if (fis == null) {
			return;
		}
		try {
			fis.close();
		} catch (IOException ignored) {
			// Nothing useful can be done when closing a read-only stream fails.
		}
		fis = null;
	}

	/**
	 * @return the map stored by the last {@link #parseXml()} call, or
	 *         {@code null} if nothing has been parsed yet
	 */
	private HashMap<String,HashSet<String>> getHash() {
		return hash;
	}

	/** Prints every key of the parsed map followed by its tab-indented values to standard output. */
	private void printHash() {
		for (Map.Entry<String,HashSet<String>> entry : hash.entrySet()) {
			String title = entry.getKey();
			System.out.println("title: " + title);
			for (String link : entry.getValue()) {
				System.out.println("\t"+link);
			}
			System.out.println();
		}
	}

	/**
	 * Writes the parsed map to {@code os} in the same layout as
	 * {@link #printHash()}, encoded as UTF-8. The stream is flushed after
	 * every entry but not closed.
	 *
	 * @param os destination stream; stays open, the caller closes it
	 * @throws IOException if writing fails
	 */
	private void printHash(FileOutputStream os) throws IOException {
		for (Map.Entry<String,HashSet<String>> entry : hash.entrySet()) {
			// Assemble the whole entry first so it costs one write instead of one per line.
			StringBuilder block = new StringBuilder();
			block.append("title: ").append(entry.getKey()).append("\n");
			for (String link : entry.getValue()) {
				block.append("\t").append(link).append("\n");
			}
			block.append("\n");
			os.write(block.toString().getBytes(LOG_CHARSET));
			os.flush();
		}
	}

	/**
	 * Returns the node {@code ltn} knows for {@code title}, creating it when
	 * {@code ltn.getId(title)} reports {@code null}.
	 */
	private static Node findOrCreateNode(LinkToNode ltn, String title) {
		if (ltn.getId(title) == null) {
			return ltn.createNodeWithPageTitle(title);
		}
		return ltn.getNode(title);
	}

	/**
	 * Opens the embedded database, registers every key of {@code links} and
	 * each of its values as nodes and connects them with
	 * {@code LinkToNode.setHasLink}. The transaction is finished and the
	 * database shut down even when the import fails; the transaction is only
	 * marked successful when the whole map has been processed.
	 *
	 * @param links map from page title to the titles that page links to
	 */
	private static void importIntoGraph(HashMap<String,HashSet<String>> links) {
		GraphDatabaseService graphDb = new EmbeddedGraphDatabase(GRAPH_DB_PATH);
		try {
			LinkToNode ltn = new LinkToNode(graphDb);

			// NOTE(review): the whole map is imported in one transaction, as before;
			// a very large input may need batching - confirm memory behaviour.
			Transaction tx = graphDb.beginTx();
			try {
				for (Map.Entry<String,HashSet<String>> entry : links.entrySet()) {
					Node pageNode = findOrCreateNode(ltn, entry.getKey());
					for (String linkPageTitle : entry.getValue()) {
						Node linkNode = findOrCreateNode(ltn, linkPageTitle);
						ltn.setHasLink(pageNode, linkNode);
					}
				}
				tx.success();
			} finally {
				tx.finish();
			}
		} finally {
			graphDb.shutdown();
		}
	}

	/**
	 * Parses the input file, writes the parsed map to the log file and imports
	 * it into the graph database. Failures are reported on standard error.
	 *
	 * @param args optional; {@code args[0]} overrides the default input path
	 */
	public static void main(String[] args) {
		final String filename = (args != null && args.length > 0) ? args[0] : DEFAULT_DUMP_PATH;

		try {
			LinkConvertGraph lcg = new LinkConvertGraph(filename);
			lcg.parseXml();

			FileOutputStream fos = new FileOutputStream(LINK_LOG_PATH);
			try {
				lcg.printHash(fos);
			} finally {
				fos.close();
			}

			importIntoGraph(lcg.getHash());

		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (SAXException e) {
			e.printStackTrace();
		} catch (ParserConfigurationException e) {
			e.printStackTrace();
		} catch (IOException e) {
			System.err.println("Failed to parse xml");
			e.printStackTrace();
		}
	}
}