| | 20 | |
| | 21 | '''1. Split the xml dump''' |
| | 22 | |
| | 23 | Split the XML file if the extracted Wikipedia dump is very large. For example, after unzipping, the English Wikipedia dump is approx. 16GB. |
| | 24 | |
| | 25 | {{{ |
| | 26 | export MARY_BASE="[PATH TO MARY BASE]" |
| | 27 | # Classpath entries are colon-separated; each continued line must end with ':\'. |
| | 28 | export CLASSPATH="$MARY_BASE/java/:\ |
| | 29 | $MARY_BASE/java/mary-common.jar:\ |
| | 30 | $MARY_BASE/java/log4j-1.2.8.jar:\ |
| | 31 | $MARY_BASE/java/mary-english.jar:\ |
| | 32 | $MARY_BASE/java/freetts.jar:\ |
| | 33 | $MARY_BASE/java/jsresources.jar:\ |
| | 34 | $MARY_BASE/java/mysql-connector-java-5.1.7-bin.jar:\ |
| | 35 | $MARY_BASE/java/httpclient-4.0-alpha4.jar:\ |
| | 36 | $MARY_BASE/java/httpcore-4.0-beta2.jar:\ |
| | 37 | $MARY_BASE/java/httpcore-nio-4.0-beta2.jar:\ |
| | 38 | $MARY_BASE/java/commons-lang-2.4.jar" |
| | 39 | # Run the splitter on the dump given by -xmlDump; output goes under -outDir. |
| | 40 | # Expansions are quoted so a MARY_BASE path containing spaces is passed intact. |
| | 41 | java -Xmx512m -classpath "$CLASSPATH" -Djava.endorsed.dirs="$MARY_BASE/lib/endorsed" \ |
| | 42 | -Dmary.base="$MARY_BASE" marytts.tools.dbselection.WikipediaDumpSplitter \ |
| | 43 | -xmlDump "enwiki-latest-pages-articles.xml" \ |
| | 44 | -outDir "/home/username/xml_splits/" \ |
| | 45 | -maxPages 50000 |
| | 46 | |
| | 47 | }}} |
| | 48 | |
| | 49 | '''2. Make a list of split xml files''' |
| | 50 | |
| | 51 | Create a single file that lists the split XML files, one path per line. |
| | 52 | |
| | 53 | For example: wiki_files.list |
| | 54 | |
| | 55 | {{{ |
| | 56 | wikipedia/en/xml_splits/page1.xml |
| | 57 | wikipedia/en/xml_splits/page2.xml |
| | 58 | wikipedia/en/xml_splits/page3.xml |
| | 59 | wikipedia/en/xml_splits/page4.xml |
| | 60 | wikipedia/en/xml_splits/page5.xml |
| | 61 | wikipedia/en/xml_splits/page6.xml |
| | 62 | }}} |
| | 63 | |
| | 64 | |
| | 65 | '''3. Clean text and make mysql database''' |
| | 66 | |
| | 67 | Clean the text in all XML files and build the MySQL database. |
| | 68 | |
| | 69 | Please follow the steps below: |
| | 70 | a. Create a database in MySQL: |
| | 71 | |
| | 72 | {{{ |
| | 73 | create database MaryDBSelector; |
| | 74 | |
| | 75 | }}} |
| | 76 | |
| | 77 | |
| | 78 | b. Run the script below to clean the text and build the MySQL database: |
| | 79 | |
| | 80 | |
| | 81 | {{{ |
| | 82 | export MARY_BASE="[PATH TO MARY BASE]" |
| | 83 | # Colon-separated classpath; the trailing backslashes join the lines into one string. |
| | 84 | export CLASSPATH="$MARY_BASE/java/:\ |
| | 85 | $MARY_BASE/java/mary-common.jar:\ |
| | 86 | $MARY_BASE/java/log4j-1.2.8.jar:\ |
| | 87 | $MARY_BASE/java/mary-english.jar:\ |
| | 88 | $MARY_BASE/java/freetts.jar:\ |
| | 89 | $MARY_BASE/java/jsresources.jar:\ |
| | 90 | $MARY_BASE/java/mysql-connector-java-5.1.7-bin.jar:\ |
| | 91 | $MARY_BASE/java/httpclient-4.0-alpha4.jar:\ |
| | 92 | $MARY_BASE/java/httpcore-4.0-beta2.jar:\ |
| | 93 | $MARY_BASE/java/httpcore-nio-4.0-beta2.jar:\ |
| | 94 | $MARY_BASE/java/commons-lang-2.4.jar:\ |
| | 95 | $MARY_BASE/java/mwdumper-2008-04-13.jar" |
| | 96 | # Expansions are quoted so a MARY_BASE path containing spaces is passed intact. |
| | 97 | java -Xmx1000m -classpath "$CLASSPATH" -Djava.endorsed.dirs="$MARY_BASE/lib/endorsed" \ |
| | 98 | -Dmary.base="$MARY_BASE" marytts.tools.dbselection.WikipediaProcessor \ |
| | 99 | -locale "en_US" \ |
| | 100 | -mysqlHost "localhost" \ |
| | 101 | -mysqlUser "username" \ |
| | 102 | -mysqlPasswd "password" \ |
| | 103 | -mysqlDB "MaryDBSelector" \ |
| | 104 | -listFile "wiki_files.list" |
| | 105 | }}} |
| | 106 | |