#!/usr/bin/perl
#
# ljdc - read a list of post URLs (one per line) from an input file, download
# each page, extract the post id (from the URL), the post title and the image
# URL (from the page), and write one <entry> per post to an XML file.
#
# usage: ./ljdc in_file [out.xml]
#
# The run stops with exit status 1 at the first page whose title or image
# cannot be extracted; the page is dumped to STDERR so the patterns below can
# be extended.  In that case the output file is left without its closing tag.

use strict;
use warnings;

use LWP::Simple;
use HTML::Entities;

# Patterns that capture the post title from a single line of the page.
# They are tried in this order; the first non-empty capture wins.
my @TITLE_PATTERNS = (
    qr{<title>Les joies du code - (.*)</title>},
    qr{<title>The coding love - (.*)</title>},
    qr{<meta property="og:title" content="([^"]+)" />},
);

# Patterns that capture the image URL from a single line of the page.
# Order is significant (first match wins) and mirrors the historical order
# in which the page layouts were supported.  The dot before the file
# extension is escaped so that it only matches a literal '.'.
my @IMAGE_PATTERNS = (
    qr{<p class="c1">.*<img.*src="([^"]+\.gif)".*/></p>},
    qr{<div class="bodytype"> <p class="centredimg"><img .*src="([^"]+\.gif)".*/>},
    qr{<div class="bodytype"> <p class="e"><img .*src="([^"]+\.gif)".*/>},
    qr{<div class="bodytype"> <p class="e"><img .*src="([^"]+\.gif)".*>},
    qr{<div class="bodytype"> <p class="e"><img .*src="([^"]+\.jpg)".*/>},
    qr{<p class="e"><img.*src="([^"]+\.jpg)">},
    qr{<p class="e"><img.*src="([^"]+\.gif)">},
    qr{</source><img src="([^"]+\.gif)">},
    qr{<p class="e"><img src="([^"]+\.gif)">},
    qr{<p class="c1">.*<img.*src="([^"]+\.gif)".*></p>},
    qr{<p class="centredimg"><img.*src="([^"]+\.gif)".*></p>},
    qr{^<p><img.*src="(http://ljdchost\.com/[^"]+\.gif)".*></p>$},
);

# Print the command-line synopsis to standard output.
sub usage {
    print "usage: ./ljdc in_file [out.xml]\n";
    return;
}

# Return the first non-empty capture obtained by matching $line against the
# given patterns in order, or the empty string when none of them matches.
sub first_capture {
    my ($line, @patterns) = @_;
    for my $pattern (@patterns) {
        my ($found) = $line =~ $pattern;
        return $found if defined $found && length($found) > 0;
    }
    return '';
}

# Return the post title found on $line, or '' if the line holds no title.
sub extract_title {
    my ($line) = @_;
    return first_capture($line, @TITLE_PATTERNS);
}

# Return the image URL found on $line, or '' if the line holds no image.
sub extract_image {
    my ($line) = @_;
    return first_capture($line, @IMAGE_PATTERNS);
}

# Escape the characters that are not allowed verbatim in XML character data.
# Only &, <, > and " are replaced, all of which map to entities that XML
# predefines.
sub xml_escape {
    my ($text) = @_;
    return encode_entities($text, '<>&"');
}

my $size = scalar(@ARGV);

# Exact comparison: a regex match here would treat any input file whose name
# merely contains "-h" as a request for help.
if ($size == 0
    || ($size == 1 && ($ARGV[0] eq '-h' || $ARGV[0] eq '--help')))
{
    usage();
    exit(0);
}

my $in_file  = $ARGV[0];
my $out_file = "out.xml";
if ($size == 2) {
    $out_file = $ARGV[1];
}

my $in_fh;
if (!open($in_fh, '<', $in_file)) {
    print STDERR "cannot open $in_file for reading: $!\n";
    exit(1);
}

my $out_fh;
if (!open($out_fh, '>:encoding(utf-8)', $out_file)) {
    print STDERR "cannot open $out_file for writing: $!\n";
    exit(1);
}

print {$out_fh} "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
print {$out_fh} "<ljdc>\n";

while (defined(my $address = <$in_fh>)) {
    chomp($address);
    next if length($address) == 0;

    # The post id is the numeric path component following "post/".
    my $id;
    if ($address =~ m{post/([0-9]+)/}) {
        $id = $1;
    }
    else {
        print "id not found for $address\n";
        next;
    }

    # get() returns undef when the download fails.
    my $contents = get($address);
    if (!defined $contents) {
        print "Error with $address\n";
        print "Page could not be downloaded\n";
        exit 1;
    }

    my $title = "";
    my $img   = "";

    # Scan the page line by line.  The title is looked for until it has been
    # found; scanning stops at the first line that holds the image, so a
    # title is only picked up when it appears on or before that line.
    for my $line (split /\n/, $contents) {
        $title = extract_title($line) if length($title) == 0;
        $img = extract_image($line);
        last if length($img) > 0;
    }

    if (length($title) == 0 || length($img) == 0) {
        print "Error with $address\n";
        print "Title not defined\n" if length($title) == 0;
        print "IMG not defined\n"   if length($img) == 0;

        # Dump the page for diagnosis; the layer avoids "Wide character"
        # warnings when the page holds non-ASCII text.
        binmode(STDERR, ':encoding(UTF-8)');
        print STDERR $contents;
        exit 1;
    }

    # Title and image URL come from HTML, so resolve their HTML entities
    # first and then escape everything for XML; writing the decoded text
    # verbatim would produce malformed XML for any value containing & or <.
    my $xml_address = xml_escape($address);
    my $xml_title   = xml_escape(decode_entities($title));
    my $xml_img     = xml_escape(decode_entities($img));

    print {$out_fh} " <entry>\n";
    print {$out_fh} " <id>$id</id>\n";
    print {$out_fh} " <address>$xml_address</address>\n";
    print {$out_fh} " <title>$xml_title</title>\n";
    print {$out_fh} " <img>$xml_img</img>\n";
    print {$out_fh} " </entry>\n";
}

print {$out_fh} "</ljdc>\n";

close($in_fh);

# Buffered write errors only surface when the handle is closed.
if (!close($out_fh)) {
    print STDERR "cannot write $out_file: $!\n";
    exit(1);
}

print "Done !\n";