Wikipedia:編集回数の多いページの一覧/一覧データを生成する方法
一覧データを生成する方法の一例を説明します。 このページで生成方法を説明する一覧データのページは、次のとおりです。
- ja:Wikipedia:編集回数の多いページの一覧
- en:Wikipedia:Most frequently edited pages
- de:Wikipedia:Seiten nach Zahl von Bearbeitungen
- zh:Wikipedia:最多修订页面
このページで説明する方法による一覧の生成には、Javaのプログラムをコンパイル/実行する方法について、若干の知識が必要となります(高度な知識は必要ありません)。 一覧の生成では、コンピュータを使い、そのコンピュータ上でJava開発/実行環境とJavaプログラムを使います。
前提
- Javaが動作するシステムのコンピュータが一台必要となります。例えば、次のようなコンピュータシステムです。
- UNIXもしくはUNIXに似たシステム(macOS、Linux など)
- Windows
- Java(Java SE 5.0 以上)の開発/実行環境が予めコンピュータに導入されている必要があります。
- 次のJavaプログラムをコンパイルしてコンピュータ内に配置しておきます。
Namespaces.java
import java.util.HashMap;
import java.util.Map;
/**
 * Lookup table from namespace prefix (the text before the first ':' of a
 * page title) to its numeric namespace id.
 */
class Namespaces {
    /** Id returned for titles without a prefix or with an unregistered prefix. */
    public static final int MAIN_NAMESPACE = 0;

    /** Character separating the namespace prefix from the rest of a title. */
    private static final char NAMESPACE_SEPARATOR = ':';

    private final Map<String, Integer> map = new HashMap<String, Integer>();

    /**
     * Registers a namespace prefix.
     *
     * @param key namespace prefix as it appears in front of the separator
     * @param ns  numeric namespace id for that prefix
     */
    public void add(String key, int ns) {
        map.put(key, Integer.valueOf(ns));
    }

    /**
     * Resolves the namespace id of a page title.
     *
     * @param text full page title, optionally prefixed with "Prefix:"
     * @return the registered id of the prefix, or {@link #MAIN_NAMESPACE} when
     *         the title has no separator or the prefix is not registered
     */
    public int ns(String text) {
        // indexOf/substring instead of split(":")[0]: split() drops trailing
        // empty strings, so a title consisting only of separators (":")
        // yields an empty array and [0] would throw.
        final int separatorIndex = text.indexOf(NAMESPACE_SEPARATOR);
        if (separatorIndex < 0) {
            return MAIN_NAMESPACE;
        }
        final Integer ns = map.get(text.substring(0, separatorIndex));
        return ns == null ? MAIN_NAMESPACE : ns.intValue();
    }
}
Page.java
/**
 * A page title with its namespace id and two edit counters that start at
 * zero and can only be incremented one step at a time.
 */
class Page {
    private final String title;
    private final int ns;
    private int edits = 0;
    private int totalEdits = 0;

    /**
     * @param title page title
     * @param ns    numeric namespace id of the page
     */
    public Page(String title, int ns) {
        this.ns = ns;
        this.title = title;
    }

    /** @return the title given at construction */
    public String getTitle() {
        return this.title;
    }

    /** @return the namespace id given at construction */
    public int getNs() {
        return this.ns;
    }

    /** @return how many times {@link #incrementEdits()} has been called */
    public int getEdits() {
        return this.edits;
    }

    /** @return how many times {@link #incrementTotalEdits()} has been called */
    public int getTotalEdits() {
        return this.totalEdits;
    }

    /** Adds one to the edit counter. */
    public void incrementEdits() {
        this.edits += 1;
    }

    /** Adds one to the total-edit counter. */
    public void incrementTotalEdits() {
        this.totalEdits += 1;
    }
}
PagesByNumberOfRecentEdits.java
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Comparator;
import java.util.Date;
import java.util.EmptyStackException;
import java.util.Stack;
import java.util.TimeZone;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * Base class of the list generators: reads a gzip-compressed XML dump with a
 * SAX parser, counts the revisions of every page inside a period given by
 * system properties, and prints the top pages as a wiki table to standard
 * output (UTF-8). Progress and diagnostics go to standard error.
 *
 * <p>System properties: {@code begin.date} and {@code end.date} (required,
 * {@code yyyy-MM-dd}, interpreted as UTC days), {@code limit} (default 5000)
 * and {@code min.edits} (default 15).
 *
 * <p>Subclasses supply the wiki name, the table header and, optionally, a
 * header printed before the table.
 */
public abstract class PagesByNumberOfRecentEdits {
    private static final String YEARMONTH_FORMAT_STRING = "yyyy-MM";
    private static final String DATE_FORMAT_STRING = YEARMONTH_FORMAT_STRING + "-dd";
    private static final String TIME_FORMAT_STRING = "HH:mm:ss";

    /**
     * Time zone used for every date computation. The dump timestamps are
     * parsed as UTC and the subclasses label the printed period "(UTC)", so
     * the period boundaries must not depend on the JVM default time zone.
     * Must be declared before {@link #DATE_FORMAT} (static init order).
     */
    private static final TimeZone UTC = TimeZone.getTimeZone("UTC");

    /** Parses and formats the period boundaries ({@code yyyy-MM-dd}, UTC). */
    protected static final SimpleDateFormat DATE_FORMAT = newUtcFormat(DATE_FORMAT_STRING);

    private static final String LIMIT_PROPERTY_KEY = "limit";
    private static final String BEGIN_DATE_PROPERTY_KEY = "begin.date";
    private static final String END_DATE_PROPERTY_KEY = "end.date";
    private static final String MINIMUM_EDITS_PROPERTY_KEY = "min.edits";

    /** Class-name suffix appended to "wikitable" by default. */
    protected final String SORTABLE = " sortable";

    private final Date dateStarted = new Date();
    private int limit = 0;
    private PrintWriter writer = null;
    private Date beginTimestamp = null;
    private Date endTimestamp = null;

    /** Creates a date format for the given pattern that works in UTC. */
    private static SimpleDateFormat newUtcFormat(String pattern) {
        final SimpleDateFormat format = new SimpleDateFormat(pattern);
        format.setTimeZone(UTC);
        return format;
    }

    /** @return the writer the list is printed to; null before {@link #execute} created it */
    protected PrintWriter getWriter() {
        return writer;
    }

    /** @return inclusive start of the period; null until the dump has been parsed */
    protected Date getBeginTimestamp() {
        return beginTimestamp;
    }

    /** @return inclusive end of the period; null until the dump has been parsed */
    protected Date getEndTimestamp() {
        return endTimestamp;
    }

    /** @return the prefix the dump file name is expected to start with */
    protected abstract String getWikiName();

    /** @return the cells of the table header row, without the leading "! " */
    protected abstract String getTableHeader();

    /** Prints whatever should precede the table; does nothing by default. */
    protected void printHeader() {
        return;
    }

    /** @return the text appended to the "wikitable" class attribute */
    protected String getSortable() {
        return SORTABLE;
    }

    /**
     * Runs the whole generation. Terminates the JVM with status 1 when the
     * dump file argument is missing, a property is invalid, or reading or
     * parsing the dump fails.
     *
     * @param args command-line arguments; {@code args[0]} is the dump file
     */
    protected void execute(String[] args) {
        try {
            final int VALID_ARGUMENT_LENGTH = 1;
            if (args.length < VALID_ARGUMENT_LENGTH) {
                printUsage();
                System.exit(1);
            }
            writer = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
            System.err.println("Started. " + dateStarted);
            try {
                limit = Integer.parseInt(System.getProperty(LIMIT_PROPERTY_KEY, "5000"));
            } catch (NumberFormatException e) {
                // Caught right here so that the message cannot be triggered
                // by an unrelated number format problem further down.
                System.err.println("The specified system property \"" + LIMIT_PROPERTY_KEY + "\" is not a valid integer.");
                System.err.println(e);
                System.exit(1);
            }
            final File dumpFile = new File(args[0]);
            fileNameCheck(dumpFile);
            final DumpHandler dumpHandler = new DumpHandler();
            dumpHandler.setLimit(limit);
            parseDump(dumpFile, dumpHandler);
            final Page[] pages = dumpHandler.getPages();
            beginTimestamp = dumpHandler.getBeginTimestamp();
            endTimestamp = dumpHandler.getEndTimestamp();
            print(pages);
        } catch (FileNotFoundException e) {
            System.err.println(e);
            System.exit(1);
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
            System.exit(1);
        } catch (SAXException e) {
            e.printStackTrace();
            System.exit(1);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        } finally {
            final Date dateEnded = new Date();
            System.err.println("Ended. " + dateEnded);
            // The elapsed time is a duration, so it is rendered as a UTC time of day.
            final SimpleDateFormat dateFormat = newUtcFormat(TIME_FORMAT_STRING);
            System.err.println("Elapsed: " + dateFormat.format(new Date(dateEnded.getTime() - dateStarted.getTime())));
        }
    }

    /** Feeds the gzip-compressed dump to the handler and always closes the streams. */
    private static void parseDump(File dumpFile, DumpHandler dumpHandler)
            throws IOException, ParserConfigurationException, SAXException {
        final FileInputStream fileStream = new FileInputStream(dumpFile);
        try {
            final GZIPInputStream gzipStream = new GZIPInputStream(fileStream);
            try {
                SAXParserFactory.newInstance().newSAXParser().parse(gzipStream, dumpHandler);
            } finally {
                gzipStream.close();
            }
        } finally {
            // Also covers the case where the GZIP header could not be read.
            fileStream.close();
        }
    }

    /** Prints an example command line to standard error. */
    private void printUsage() {
        System.err.print("Usage (example): java -D" + BEGIN_DATE_PROPERTY_KEY + "=2008-04-01"
                + " -D" + END_DATE_PROPERTY_KEY + "=2008-04-30"
                + " -D" + LIMIT_PROPERTY_KEY + "=5000"
                + " -D" + MINIMUM_EDITS_PROPERTY_KEY + "=15");
        System.err.print(" " + getClass().getName());
        System.err.print(" " + getWikiName() + "-20080501-stub-meta-history.xml.gz");
        System.err.print(" > result.txt");
        System.err.println();
    }

    /**
     * Sorts the collected pages and prints them as a wiki table. Pages with
     * the same number of edits share a rank; printing stops at the first page
     * whose rank exceeds the limit.
     *
     * @param pages collected pages; unused trailing slots may be null
     */
    private void print(Page[] pages) {
        try {
            printHeader();
            Arrays.sort(pages, new PagesComparator());
            writer.print("{| class=\"wikitable" + getSortable() + "\"");
            writer.println();
            writer.print("! " + getTableHeader());
            writer.println();
            int rank = 0;
            int prevCount = 0;
            int sameRank = 0;
            for (Page page : pages) {
                if (page == null) {
                    // The array is over-allocated; the comparator sorts the
                    // unused (null) slots last, so nothing follows them.
                    break;
                }
                if (rank == 0) {
                    rank++;
                    sameRank = 1;
                } else if (page.getEdits() < prevCount) {
                    rank += sameRank;
                    sameRank = 1;
                } else {
                    sameRank++;
                }
                prevCount = page.getEdits();
                if (rank > limit) {
                    break;
                }
                writer.print("|-");
                writer.println();
                writer.print("| " + Integer.toString(rank));
                writer.print(" || ");
                writer.print("[[:" + page.getTitle() + "]]");
                writer.print(" || ");
                writer.print(page.getNs());
                writer.print(" || ");
                writer.print(page.getEdits());
                writer.print(" || ");
                writer.print(page.getTotalEdits());
                writer.println();
            }
            writer.print("|}");
            writer.println();
        } finally {
            writer.flush();
        }
    }

    /**
     * Warns (and pauses five seconds so the warning is noticed) when the dump
     * file name does not start with the wiki name of this generator.
     */
    private void fileNameCheck(File file) {
        if (!file.getName().startsWith(getWikiName())) {
            System.err.println("WARNING: The specified file name '" + file.getName() + "' does not start with '" + getWikiName() + "'.");
            try {
                Thread.sleep(5000);
            } catch (InterruptedException e) {
                // Only the pause is cut short; keep the interrupt visible to callers.
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Orders pages by edits, then by total edits, both descending; null
     * entries sort after every page.
     */
    private static class PagesComparator implements Comparator<Page> {
        public int compare(Page page1, Page page2) {
            if (page1 == null || page2 == null) {
                if (page1 == page2) {
                    return 0;
                }
                return page1 == null ? 1 : -1;
            }
            if (page1.getEdits() != page2.getEdits()) {
                return descending(page1.getEdits(), page2.getEdits());
            }
            return descending(page1.getTotalEdits(), page2.getTotalEdits());
        }

        /** Compares so that the larger value sorts first (no subtraction, no overflow). */
        private static int descending(int left, int right) {
            if (left == right) {
                return 0;
            }
            return left > right ? -1 : 1;
        }
    }

    /**
     * SAX handler that registers the namespaces declared in the dump and
     * counts revisions per page, keeping only a bounded buffer of the most
     * edited pages.
     */
    private static class DumpHandler extends DefaultHandler {
        /** Dump timestamps are parsed with "UTC" appended to match the trailing zone field. */
        private static final DateFormat TIMESTAMP_DUMP_FORMAT
                = new SimpleDateFormat(DATE_FORMAT_STRING + "'T'" + TIME_FORMAT_STRING + "'Z'z");
        private static final PagesComparator PAGES_COMPARATOR = new PagesComparator();
        private static final int LOG_INTERVAL = 10000;

        private final Namespaces namespaces = new Namespaces();
        private final Stack<String> elementStack = new Stack<String>();
        /** Month (year and month fields only) the "edits a day" figure is computed for. */
        private final Calendar lastMonth = Calendar.getInstance(UTC);
        /** Reused for every revision instead of allocating a calendar each time. */
        private final Calendar revisionCalendar = Calendar.getInstance(UTC);

        private Date beginTimestamp = null;
        private Date endTimestamp = null;
        private int minimumEdits = 0;
        private int limit = 0;
        private int editsInLastMonth = 0;
        private int revisionCounter = 0;
        private int totalEdits = 0;
        private int ns = 0;
        private String namespace = "";
        private String pageTitle = "";
        private Page page = null;
        private Page[] pages = null;
        private int storedPageCount = 0;
        private Date timestamp = null;
        private String timestampString = "";
        private boolean ignoreRevision = false;
        private int timestampParseExceptionCount = 0;

        /** @return inclusive start of the period; null before the document started */
        public Date getBeginTimestamp() {
            return beginTimestamp;
        }

        /** @return inclusive end of the period (23:59:59 of the end date); null before the document started */
        public Date getEndTimestamp() {
            return endTimestamp;
        }

        /** @return the page buffer; unused slots are null; null before the document started */
        public Page[] getPages() {
            return pages;
        }

        private void setLimit(int limit) {
            this.limit = limit;
        }

        /**
         * Reads the period and the minimum edit count from the system
         * properties and allocates the page buffer (1.5 times the limit).
         *
         * @throws SAXException when a date property is missing or malformed,
         *                      or the minimum edit count is not an integer
         */
        @Override
        public void startDocument() throws SAXException {
            beginTimestamp = getDateProperty(BEGIN_DATE_PROPERTY_KEY);
            final Calendar endTimestampCalendar = Calendar.getInstance(UTC);
            endTimestampCalendar.setTime(getDateProperty(END_DATE_PROPERTY_KEY));
            // Extend the end date to the last second of that day.
            endTimestampCalendar.add(Calendar.HOUR, 23);
            endTimestampCalendar.add(Calendar.MINUTE, 59);
            endTimestampCalendar.add(Calendar.SECOND, 59);
            endTimestamp = endTimestampCalendar.getTime();
            lastMonth.setTime(endTimestamp);
            if (endTimestampCalendar.get(Calendar.DATE) != endTimestampCalendar.getActualMaximum(Calendar.DATE)) {
                // The end month is incomplete, so use the month before it.
                // add() rather than roll(): roll() leaves the year untouched
                // and would turn January into December of the same year.
                lastMonth.add(Calendar.MONTH, -1);
            }
            pages = new Page[Math.max(0, (int) (limit * 1.5))];
            final String minimumEditsText = System.getProperty(MINIMUM_EDITS_PROPERTY_KEY, "15");
            try {
                minimumEdits = Integer.parseInt(minimumEditsText);
            } catch (NumberFormatException e) {
                throw new SAXException("The specified system property \"" + MINIMUM_EDITS_PROPERTY_KEY
                        + "\" is not a valid integer: " + minimumEditsText, e);
            }
        }

        /** Prints the summary figures to standard error. */
        @Override
        public void endDocument() throws SAXException {
            System.err.println("Processed: " + revisionCounter);
            // The label is taken from lastMonth, the month the figure is computed for.
            System.err.println("As of the last month"
                    + " (" + newUtcFormat(YEARMONTH_FORMAT_STRING).format(lastMonth.getTime()) + "),"
                    + " the Wikipedia received "
                    + (editsInLastMonth / lastMonth.getActualMaximum(Calendar.DATE))
                    + " edits a day.");
            System.err.println("The " + totalEdits + " total edits made to the Wikipedia.");
            if (timestampParseExceptionCount > 0) {
                System.err.println("Timestamp ParseException: " + timestampParseExceptionCount + " occurred.");
            }
        }

        /**
         * Reads a {@code yyyy-MM-dd} date from a system property.
         *
         * @throws SAXException when the property is not set or cannot be parsed
         */
        private static Date getDateProperty(String key) throws SAXException {
            final String property = System.getProperty(key);
            if (property == null) {
                throw new SAXException("The system property \"" + key + "\" is required (format: "
                        + DATE_FORMAT_STRING + ").");
            }
            try {
                return DATE_FORMAT.parse(property);
            } catch (ParseException e) {
                throw new SAXException("The system property \"" + key + "\" is not a valid date: " + property, e);
            }
        }

        /** Tracks the open element and reads the id of a namespace declaration. */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
            final String name = localName.equals("") ? qName : localName;
            elementStack.push(name);
            if (name.equals("namespace")) {
                String key = "";
                try {
                    key = atts.getValue("key");
                    ns = Integer.parseInt(key);
                } catch (NumberFormatException e) {
                    throw new SAXException("ns: " + key, e);
                }
            }
        }

        /** Dispatches on the element that has just been closed. */
        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            final String name = elementStack.pop();
            if (name.equals("namespace")) {
                namespaces.add(namespace, ns);
                ns = 0;
                namespace = "";
            } else if (name.equals("page")) {
                storeFinishedPage();
            } else if (name.equals("title")) {
                page = new Page(pageTitle, namespaces.ns(pageTitle));
                pageTitle = "";
            } else if (name.equals("timestamp")) {
                parseCollectedTimestamp();
            } else if (name.equals("revision")) {
                countFinishedRevision();
            }
        }

        /**
         * Puts the finished page into the bounded buffer: appended while
         * there is room, otherwise it replaces the lowest ranked page if it
         * has more edits. Pages below the minimum edit count are dropped.
         */
        private void storeFinishedPage() {
            if (page == null || page.getEdits() < minimumEdits || pages.length == 0) {
                return;
            }
            boolean stored = false;
            if (storedPageCount < pages.length) {
                pages[storedPageCount] = page;
                storedPageCount++;
                stored = true;
            } else {
                // The buffer is full and sorted, so its last slot holds the
                // lowest ranked page.
                final Page lastPage = pages[pages.length - 1];
                if (page.getEdits() > lastPage.getEdits()) {
                    pages[pages.length - 1] = page;
                    stored = true;
                }
            }
            // Re-sorting is only needed when the buffer actually changed.
            if (stored && storedPageCount >= limit) {
                Arrays.sort(pages, PAGES_COMPARATOR);
            }
        }

        /** Parses the collected timestamp text; on failure the revision is marked to be skipped. */
        private void parseCollectedTimestamp() {
            ignoreRevision = false;
            try {
                timestamp = TIMESTAMP_DUMP_FORMAT.parse(timestampString + "UTC");
            } catch (ParseException e) {
                timestampParseExceptionCount++;
                ignoreRevision = true;
            } finally {
                // Always start the next timestamp from scratch; otherwise one
                // malformed value would be prepended to all following ones.
                timestampString = "";
            }
        }

        /** Adds the finished revision to the page counters and the global counters. */
        private void countFinishedRevision() {
            if (ignoreRevision || timestamp == null) {
                return;
            }
            if (timestampBeforeOrEquals(timestamp)) {
                page.incrementTotalEdits();
                totalEdits++;
                if (timestampIsInPeriod(timestamp)) {
                    page.incrementEdits();
                }
            }
            revisionCalendar.setTime(timestamp);
            if (revisionCalendar.get(Calendar.YEAR) == lastMonth.get(Calendar.YEAR)
                    && revisionCalendar.get(Calendar.MONTH) == lastMonth.get(Calendar.MONTH)) {
                editsInLastMonth++;
            }
            timestamp = null;
            revisionCounter++;
            if (revisionCounter % LOG_INTERVAL == 0) {
                System.err.println("Processed: " + revisionCounter);
            }
        }

        /** @return true when the timestamp lies within [beginTimestamp, endTimestamp] */
        private boolean timestampIsInPeriod(Date timestamp) {
            return !timestamp.before(beginTimestamp) && timestampBeforeOrEquals(timestamp);
        }

        /** @return true when the timestamp is not after endTimestamp */
        private boolean timestampBeforeOrEquals(Date timestamp) {
            return !timestamp.after(endTimestamp);
        }

        /** Collects the text of namespace, title and timestamp elements. */
        @Override
        public void characters(char[] ch, int start, int length) {
            if (elementStack.isEmpty()) {
                return;
            }
            final String elementName = elementStack.peek();
            // The text may arrive in several chunks, hence the appending.
            if (elementName.equals("namespace")) {
                namespace += new String(ch, start, length);
            } else if (elementName.equals("title")) {
                pageTitle += new String(ch, start, length);
            } else if (elementName.equals("timestamp")) {
                timestampString += new String(ch, start, length);
            }
        }
    }
}
PagesByNumberOfRecentEdits_de.java
/** List generator configured with the wiki name "dewiki" and German labels. */
public class PagesByNumberOfRecentEdits_de extends PagesByNumberOfRecentEdits {
    /**
     * Entry point: creates the generator and runs it.
     * @param args command-line arguments, passed on to execute()
     */
    public static void main(String[] args) {
        final PagesByNumberOfRecentEdits_de generator = new PagesByNumberOfRecentEdits_de();
        generator.execute(args);
    }

    /** @return "dewiki" */
    protected String getWikiName() {
        return "dewiki";
    }

    /** Prints the period line followed by one empty line. */
    protected void printHeader() {
        final java.io.PrintWriter out = getWriter();
        final StringBuilder line = new StringBuilder("Frist: ");
        line.append(DATE_FORMAT.format(getBeginTimestamp()));
        line.append(" — ");
        line.append(DATE_FORMAT.format(getEndTimestamp()));
        line.append(" (UTC)");
        out.println(line.toString());
        out.println();
    }

    /** @return the header cells of the result table */
    protected String getTableHeader() {
        return "# !! Seite !! [[Hilfe:Namensräume|Namensräume]] !! Bearb. (30 T.) !! Bearb.";
    }
}
PagesByNumberOfRecentEdits_en.java
/** List generator configured with the wiki name "enwiki" and English labels. */
public class PagesByNumberOfRecentEdits_en extends PagesByNumberOfRecentEdits {
    /**
     * Entry point: creates the generator and runs it.
     * @param args command-line arguments, passed on to execute()
     */
    public static void main(String[] args) {
        final PagesByNumberOfRecentEdits_en generator = new PagesByNumberOfRecentEdits_en();
        generator.execute(args);
    }

    /** @return "enwiki" */
    protected String getWikiName() {
        return "enwiki";
    }

    /** Prints the period line followed by one empty line. */
    protected void printHeader() {
        final java.io.PrintWriter out = getWriter();
        final StringBuilder line = new StringBuilder("Period: ");
        line.append(DATE_FORMAT.format(getBeginTimestamp()));
        line.append(" — ");
        line.append(DATE_FORMAT.format(getEndTimestamp()));
        line.append(" (UTC)");
        out.println(line.toString());
        out.println();
    }

    /** @return the header cells of the result table */
    protected String getTableHeader() {
        return "Rank !! Page !! [[Wikipedia:Namespace|Namespace]] !! Recent Edits !! Total Edits";
    }
}
PagesByNumberOfRecentEdits_ja.java
/** List generator configured with the wiki name "jawiki" and Japanese labels. */
public class PagesByNumberOfRecentEdits_ja extends PagesByNumberOfRecentEdits {
    /**
     * Entry point: creates the generator and runs it.
     * @param args command-line arguments, passed on to execute()
     */
    public static void main(String[] args) {
        final PagesByNumberOfRecentEdits_ja generator = new PagesByNumberOfRecentEdits_ja();
        generator.execute(args);
    }

    /** @return "jawiki" */
    protected String getWikiName() {
        return "jawiki";
    }

    /** Prints the period line followed by one empty line. */
    protected void printHeader() {
        final java.io.PrintWriter out = getWriter();
        final StringBuilder line = new StringBuilder("期間: ");
        line.append(DATE_FORMAT.format(getBeginTimestamp()));
        line.append(" — ");
        line.append(DATE_FORMAT.format(getEndTimestamp()));
        line.append(" (UTC)");
        out.println(line.toString());
        out.println();
    }

    /** @return the header cells of the result table */
    protected String getTableHeader() {
        return "順位 !! ページ !! [[Help:名前空間|名前空間]] !! 編集回数 !! 総編集回数";
    }
}
PagesByNumberOfRecentEdits_zh.java
/** List generator configured with the wiki name "zhwiki" and Chinese labels. */
public class PagesByNumberOfRecentEdits_zh extends PagesByNumberOfRecentEdits {
    /**
     * Entry point: creates the generator and runs it.
     * @param args command-line arguments, passed on to execute()
     */
    public static void main(String[] args) {
        final PagesByNumberOfRecentEdits_zh generator = new PagesByNumberOfRecentEdits_zh();
        generator.execute(args);
    }

    /** @return "zhwiki" */
    protected String getWikiName() {
        return "zhwiki";
    }

    /** Prints the period line followed by one empty line. */
    protected void printHeader() {
        final java.io.PrintWriter out = getWriter();
        final StringBuilder line = new StringBuilder("期间: ");
        line.append(DATE_FORMAT.format(getBeginTimestamp()));
        line.append(" — ");
        line.append(DATE_FORMAT.format(getEndTimestamp()));
        line.append(" (UTC)");
        out.println(line.toString());
        out.println();
    }

    /** @return the header cells of the result table */
    protected String getTableHeader() {
        return "名次 !! 页面 !! [[Help:名字空间|名字空间]] !! 最近编辑次数 !! 累积编辑次数";
    }
}
手順
- データベースダンプが提供されているサイト (https://dumps.wikimedia.org/) からダンプデータをダウンロードします。データベースダンプのデータの生成は不定期に行われています。必要となるのは次のファイルです。
- stub-meta-history.xml.gz
- ウィキペディアのデータベースダンプの進捗を知るために、RSSフィードを購読することができます。
- Javaプログラムを実行して一覧データを生成します。
- 出力されたテキストファイルはウィキテキスト(ウィキのソース)の形式になっていますので、テキストファイルをエディタで開き、内容をコピーしてウェブブラウザ上の編集画面に貼り付けることで、一覧ページを更新することができます。
- ダウンロード元: https://dumps.wikimedia.org/jawiki/
- Javaプログラム実行例:
java -Dbegin.date=2008-04-01 -Dend.date=2008-04-30 -Dlimit=1000 -Dmin.edits=20 PagesByNumberOfRecentEdits_ja jawiki-20080501-stub-meta-history.xml.gz > result.txt
- ダウンロード元: https://dumps.wikimedia.org/enwiki/
- Javaプログラム実行例:
java -Dbegin.date=2008-04-01 -Dend.date=2008-04-30 -Dlimit=5000 -Dmin.edits=80 PagesByNumberOfRecentEdits_en enwiki-20080501-stub-meta-history.xml.gz > result.txt
- ダウンロード元: https://dumps.wikimedia.org/dewiki/
- Javaプログラム実行例:
java -Dbegin.date=2008-04-01 -Dend.date=2008-04-30 -Dlimit=1000 -Dmin.edits=40 PagesByNumberOfRecentEdits_de dewiki-20080501-stub-meta-history.xml.gz > result.txt
- ダウンロード元: https://dumps.wikimedia.org/zhwiki/
- Javaプログラム実行例:
java -Dbegin.date=2008-04-01 -Dend.date=2008-04-30 -Dlimit=1000 PagesByNumberOfRecentEdits_zh zhwiki-20080501-stub-meta-history.xml.gz > result.txt