Dynastie/misc/ljdc.perl

148 lines
3.1 KiB
Plaintext
Raw Permalink Normal View History

2014-03-27 18:29:06 +01:00
#!/usr/bin/perl
use LWP::Simple;
2015-07-08 07:40:36 +02:00
use HTML::Entities;
2014-03-27 18:29:06 +01:00
# Print the one-line command-line synopsis on the currently selected
# output handle (STDOUT unless the caller has select()ed another one).
sub usage() {
    my $synopsis = "usage: ./ljdc in_file [out.xml]\n";
    print $synopsis;
}
# ---------------------------------------------------------------------------
# Main script.
#
# Reads one post URL per line from in_file, downloads each page, scrapes the
# post title and image URL out of the HTML and writes one <entry> per post
# to an XML file ("out.xml" unless a second argument is given).
#
# Exit status: 0 after printing the usage message or on success; 1 when a
# file cannot be opened/closed or a page cannot be downloaded or scraped.
# ---------------------------------------------------------------------------

# Pragmas are lexically scoped: enabling them here covers everything below.
use strict;
use warnings;

# Escape the characters that are special in XML so that titles and URLs
# containing "&", "<", ">" or '"' cannot produce a malformed document.
sub _xml_escape {
    my ($text) = @_;
    return encode_entities($text, '<>&"');
}

# Scan a downloaded page line by line and return (title, image URL).
# Either value is the empty string when nothing matched.  Scanning stops at
# the first line that contains a recognised image, so a title is only picked
# up if it appears on or before that line.
sub _scrape_post {
    my ($contents) = @_;

    # Tried in order; the first pattern yielding a non-empty capture wins.
    my @title_patterns = (
        qr{<title>Les joies du code - (.*)</title>},
        qr{<title>The coding love - (.*)</title>},
        qr{<meta property="og:title" content="([^"]+)" />},
    );

    # Tried in order on every line; each one matches a markup variant the
    # script has been taught to recognise.  Dots in file extensions and in
    # the host name are escaped so they only match a literal ".".
    my @img_patterns = (
        qr{<p class="c1">.*<img.*src="([^"]+\.gif)".*/></p>},
        qr{<div class="bodytype"> <p class="centredimg"><img .*src="([^"]+\.gif)".*/>},
        qr{<div class="bodytype"> <p class="e"><img .*src="([^"]+\.gif)".*/>},
        qr{<div class="bodytype"> <p class="e"><img .*src="([^"]+\.gif)".*>},
        qr{<div class="bodytype"> <p class="e"><img .*src="([^"]+\.jpg)".*/>},
        qr{<p class="e"><img.*src="([^"]+\.jpg)">},
        qr{<p class="e"><img.*src="([^"]+\.gif)">},
        qr{</source><img src="([^"]+\.gif)">},
        qr{<p class="e"><img src="([^"]+\.gif)">},
        qr{<p class="c1">.*<img.*src="([^"]+\.gif)".*></p>},
        qr{<p class="centredimg"><img.*src="([^"]+\.gif)".*></p>},
        qr{^<p><img.*src="(http://ljdchost\.com/[^"]+\.gif)".*></p>$},
    );

    my ($title, $img) = ("", "");

    LINE:
    foreach my $line (split /\n/, $contents) {
        # Title
        foreach my $title_re (@title_patterns) {
            last if length($title) > 0;
            $title = $1 if $line =~ $title_re;
        }

        # IMG
        foreach my $img_re (@img_patterns) {
            if ($line =~ $img_re) {
                $img = $1;
                last LINE;
            }
        }
    }

    return ($title, $img);
}

my $size = scalar(@ARGV);

# Compare exactly: a regex match against "-h" would also fire for any input
# file whose name merely contains "-h" (e.g. "posts-html.txt").
if ($size == 0 || ($size == 1 && ($ARGV[0] eq "-h" || $ARGV[0] eq "--help"))) {
    usage();
    exit(0);
}

my $in_file  = $ARGV[0];
my $out_file = ($size == 2) ? $ARGV[1] : "out.xml";

# Three-argument open keeps characters in the file name from being
# interpreted as an open mode.
open(my $in_fh, '<', $in_file) or do {
    print STDERR "Cannot open $in_file for reading: $!\n";
    exit(1);
};
open(my $out_fh, '>:encoding(utf-8)', $out_file) or do {
    print STDERR "Cannot open $out_file for writing: $!\n";
    exit(1);
};

print {$out_fh} "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
print {$out_fh} "<ljdc>\n";

while (defined(my $address = <$in_fh>)) {
    chomp($address);
    next if (length($address) == 0);

    # The numeric post id is taken from the ".../post/<digits>/..." part
    # of the URL; lines without it are reported and skipped.
    my $id;
    if ($address =~ m{post/([0-9]+)/}) {
        $id = $1;
    }
    else {
        print "id not found for $address\n";
        next;
    }

    # LWP::Simple::get returns undef when the download fails.
    my $contents = get($address);
    if (!defined($contents)) {
        print "Error with $address\n";
        print "Download failed\n";
        exit 1;
    }

    my ($title, $img) = _scrape_post($contents);

    if (length($title) == 0 || length($img) == 0) {
        print "Error with $address\n";
        print "Title not defined\n" if (length($title) == 0);
        print "IMG not defined\n" if (length($img) == 0);
        # Dump the page so the unrecognised markup can be inspected.
        print STDERR $contents;
        exit 1;
    }

    # Titles are scraped with their HTML entities; decode them to plain
    # text first, then re-escape only what XML requires.
    $title = decode_entities($title);

    print {$out_fh} " <entry>\n";
    print {$out_fh} " <id>$id</id>\n";
    print {$out_fh} " <address>" . _xml_escape($address) . "</address>\n";
    print {$out_fh} " <title>" . _xml_escape($title) . "</title>\n";
    print {$out_fh} " <img>" . _xml_escape($img) . "</img>\n";
    print {$out_fh} " </entry>\n";
}

print {$out_fh} "</ljdc>\n";

close($in_fh);
# Buffered write errors only surface at close time.
close($out_fh) or do {
    print STDERR "Cannot close $out_file: $!\n";
    exit(1);
};

print "Done !\n";