I'm trying to get a whole bunch of values from around 3,000 HTML files and save them to a spreadsheet.
I'm using
HTML::TreeBuilder to process the HTML and creating a spreadsheet using
Spreadsheet::WriteExcel.
But my script doesn't successfully get the values. I see
Use of uninitialized value $val in concatenation (.) or string at spreadsheet.pl line 63.
What might I be doing wrong?
Here's an example of my HTML files on pastebin.com. It was too large to post in the question.
My Perl code
use warnings 'all';
use strict;
use LWP::Simple 'get';
use Spreadsheet::WriteExcel;
use HTML::TreeBuilder;
use Path::Tiny;
use constant URL => 'http://pastebin.com/raw/qLwu80ZW';
my $teamNumber = "";
my $teamName = "";
my $schoolName = "";
my $area = "";
my $district = "";
my $agDeptPhone = "";
my $schoolPhone = "";
my $fax = "";
my $addressOne = "";
my $addressTwo = "";
my $city = "";
my $state = "";
my $zipCode = "";
my $name = "";
my $email = "";
my $row = "";
my $Ypos = 0;
my $path = "Z:\\_WEB_CLIENTS\\Morgan Livestock\\Judging Card";
my $workbook = Spreadsheet::WriteExcel->new('perlOutput.xlsx');
my $worksheet = $workbook->add_worksheet();
sub getTeamNumber {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_TeamNumber/)->attr('value');
}
print "Got Team Number $val\n";
return $val;
}
sub getTeamName {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_TeamName/)->attr('value');
}
print "Got Team Name $val\n";
return $val;
}
sub getSchoolName {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(tag_ => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_SchoolName/)->attr('value');
}
print "Got School Name $val\n";
return $val;
}
sub getArea{
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(tag_ => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_Area/)->attr('value');
}
print "Got Area $val\n";
return $val;
}
sub getDistrict{
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_District/)->attr('value');
}
print "Got District $val\n";
return $val;
}
sub getDeptPhone {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_Phone/)->attr('value');
}
print "Got Dept Phone $val\n";
return $val;
}
sub getSchoolPhone{
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_Phone2/)->attr('value');
}
print "Got School Phone $val\n";
return $val;
}
sub getFax{
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_Fax/)->attr('value');
}
print "Got Fax $val\n";
return $val;
}
sub getAddress1 {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_Address1/)->attr('value');
}
print "Got Address One $val\n";
return $val;
}
sub getAddress2 {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_Address2/)->attr('value');
}
print "Got Address Two $val\n";
return $val;
}
sub getCity {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_City/)->attr('value');
}
print "Got Address Two $val\n";
return $val;
}
sub getState {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_State/)->attr('value');
}
print "Got State $val\n";
return $val;
}
sub getZip {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_Zip/)->attr('value');
}
print "Got Zip $val\n";
return $val;
}
sub getWebsite {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $val;
foreach my $node (@nodes) {
$val = $node->look_down('name', qr/\$txt_Website/)->attr('value');
}
print "Got Website $val\n";
return $val;
}
sub getNameAndEmail {
my ($file) = @_;
my $tree = HTML::TreeBuilder->new_from_content(get URL);
my ($table) = $tree->look_down(_tag => 'table', class => 'rgMasterTable');
for my $tr ( $table->look_down(_tag => 'tr') ) {
next unless my @td = $tr->look_down(_tag => 'td');
my ($name, $email) = map { $_->as_trimmed_text } @td[0,1];
}
print "Got Name and Email $name and $email\n";
return ($name, $email);
}
# FILLER: This fills the spreadsheet with all the variables we've acquired
sub fill {
my ($name, $email, $teamNumber, $teamName, $schoolName,
$area, $district, $agDeptPhone, $schoolPhone,
$fax, $addressOne, $addressTwo, $city, $state, $zipCode) = (@_);
$worksheet->write($Ypos, 1, $name);
$worksheet->write($Ypos, 2, $email);
$worksheet->write($Ypos, 3, $teamNumber);
$worksheet->write($Ypos, 4, $teamName);
$worksheet->write($Ypos, 5, $schoolName);
$worksheet->write($Ypos, 6, $area);
$worksheet->write($Ypos, 7, $district);
$worksheet->write($Ypos, 8, $agDeptPhone);
$worksheet->write($Ypos, 9, $schoolPhone);
$worksheet->write($Ypos, 10, $fax);
$worksheet->write($Ypos, 11, $addressOne);
$worksheet->write($Ypos, 12, $addressTwo);
$worksheet->write($Ypos, 13, $city);
$worksheet->write($Ypos, 14, $state);
$worksheet->write($Ypos, 15, $zipCode);
}
# Open judgingcard directory
opendir (DIR, $path) or die "Unable to open directory 'Judging Card': $!";
my @files = readdir(DIR);
# This fills out all top row info
$worksheet->write("A1", "Name");
$worksheet->write("B1", "Email");
$worksheet->write("C1", "Team Number");
$worksheet->write("D1", "Team Name");
$worksheet->write("E1", "School Name");
$worksheet->write("F1", "Area");
$worksheet->write("G1", "District");
$worksheet->write("H1", "Ag Dept Phone");
$worksheet->write("I1", "School Phone");
$worksheet->write("J1", "Fax");
$worksheet->write("K1", "Address One");
$worksheet->write("L1", "Address Two");
$worksheet->write("M1", "City");
$worksheet->write("N1", "State");
$worksheet->write("O1", "Zip Code");
###################################
foreach my $file (@files) { # run through all files in directory
next if (-d $file); # Skip file if file is folder
$Ypos = $Ypos + 1;
my ($name1, $email1) = getNameAndEmail($file);
$name = $name1;
$email = $email1;
$teamNumber = getTeamNumber($file);
$teamName = getTeamName($file);
$schoolName = getSchoolName($file);
$area = getArea($file);
$district = getDistrict($file);
$agDeptPhone = getDeptPhone($file);
$schoolPhone = getSchoolPhone($file);
$fax = getFax($file);
$addressOne = getAddress1($file);
$addressTwo = getAddress2($file);
$city = getCity($file);
$state = getState($file);
$zipCode = getZip($file);
fill($name, $email, $teamNumber, $teamName, $schoolName,
$area, $district, $agDeptPhone, $schoolPhone, $fax,
$addressOne, $addressTwo, $city, $state, $zipCode);
print "Progressing $file ($Ypos)\n"
}
closedir(DIR);
sub getTeamNumber {
my ($file) = @_;
my $html = path($file);
my $tree = HTML::TreeBuilder->new_from_content($html);
my @nodes = $tree->look_down(_tag => 'input');
my $name;
my $val;
foreach my $node (@nodes) {
$name = $node->look_down('name', qr/\$txt_TeamNumber/);
}
if ( ! defined $name ) {
print "Couldn't get team number\n";
}
if ( $name ) {
$val = $name->attr('value');
print "Got Team number $val\n";
}
return $val;
}
New script:
use LWP::Simple 'get';
use Spreadsheet::WriteExcel;
use HTML::TreeBuilder;
use Path::Tiny;
my $path = "Z:\\_WEB_CLIENTS\\Morgan Livestock\\Judging Card";
my $workbook = Spreadsheet::WriteExcel->new('perlOutput.xlsx');
my $worksheet = $workbook->add_worksheet();
opendir (DIR, $path) or die "Unable to open directory 'Judging Card': $!";
my @files = readdir(DIR);
# Specify spreadsheet headers in desired order and write to file
my @headers = ('Name', 'Email', 'Team Number', 'Team Name', 'School Name', 'Area', 'District', 'Ag Dept Phone', 'School Phone', 'Fax', 'Address One'
, 'Address Two', 'City', 'State', 'Zip Code');
$worksheet->write_row(0, 0, \@headers); # first row
# Build ancillary data structures to later sort results by this order
# each header with its index from @headers (specifies columns' order)
my %ho = map { state $idx; $_ => ++$idx } @headers;
# each name (`TeamNumber` ...) with the index of its header
my %name_order = ( Name => $ho{Name}, Email => $ho{Email},
TeamNumber => $ho{'Team Number'}, TeamName => $ho{'Team Name'}, SchoolName => $ho{'School Name'}, Area => $ho{'Area'}, District => $ho{'District'},
AgDeptPhone => $ho{'Ag Dept Phone'}, SchoolPhone => $ho{'School Phone'}, Fax => $ho{'Fax'}, AddressOne => $ho{'Address One'},
AddressTwo => $ho{'Address Two'}, City => $ho{'City'}, State => $ho{'State'}, Zip => $ho{'Zip Code'});
sub getNames {
my ($file) = @_;
my $tree = HTML::TreeBuilder->new_from_content( path($file) );
my @nodes = $tree->look_down(_tag => 'input');
# List phrases to find, and build hash with their derived names
# Should probably be defined globally, once for the whole program
my @patterns = map { '$txt_' . $_ }
qw(TeamName TeamNumber SchoolName Area District
Phone Phone2 Fax Address1 Address2 City State Zip Website);
# Name for each pattern: everything after first _ (so after $txt_)
my %patt_name = map { $_ => (/[^_]+_(.*)/)[0] } @patterns;
my %name_val;
foreach my $node (@nodes) {
foreach my $patt (@patterns) {
my $name = $node->look_down('name', qr/\Q$patt/);
if ($name) {
$name_val{$patt_name{$patt}} = $name->attr('value') || '';
}
}
}
# Name and Email are stored differently. Fetch those now
my ($table) = $tree->look_down(_tag => 'table', class => 'rgMasterTable');
for my $tr ( $table->look_down(_tag => 'tr') ) {
next unless my @td = $tr->look_down(_tag => 'td');
# Discard incomplete Name-Email records -- either both or none
@name_val{qw(Name Email)} =
map { (defined) ? $_->as_trimmed_text : '' } @td[0,1];
}
return \%name_val;
}
sub fill_row {
my ($ws, $row, $rdata, $rorder) = @_;
my %name_val = %$rdata;
my %name_order = %$rorder;
my @vals = map { $name_val{$_} }
sort { $name_order{$a} <=> $name_order{$b} }
keys %name_val;
$ws->write_row($row, 0, \@vals); # add check (returns 0 on success)
return 1;
my $row = 1;
}
foreach my $file (@files) {
next if -d $file;
my %name_val = %{ getNames($file) };
foreach my $name (sort keys %name_val) {
# Fill the spreadsheet with all info in one go
if ($name_val{$name}) {
print "$name => $name_val{$name}\n";
} else {
print "Not found $name in $file\n";
}
}
my %name_val = %{ getNames($file) };
fill_row($worksheet, $row++, \%name_val, \%name_order);
foreach my $name (sort keys %name_val) { # demo
if ($name_val{$name}) { print "$name => $name_val{$name}\n" }
else { print "Not found $name in $file\n" }
}
print "Progressing $Ypos \n"
}
In short, some of those
'name'are probably just not found in (some of) HTML files. So test first to see whether it is there, then write to$valor print message about it not being found.The most apparent thing to be improved: there is no need for separate functions. You can search and find all of them in one call, and store them in the hash
name => value, which is returned.For
NameandEmailwe require both to be there as text, or both are thrown out. (The sample source hasThere are no people ...inside adivforName, and nothing forEmail.)To get whatever is there, instead of
if-elseabove useand we get the note quoted above for
Nameand an empty string forEmail, with this sample.Then
The write_row takes a reference to an array and writes out a row with its elements. Note that
writecan be used that way, too, when an arrayref is given.The output on the linked HTML file
and
Not found ...for others. The.xlsfile is correct (when the full list of names is used).The whole program
This works as a complete program with the supplied sample HTML source.
Addition: an actual page may have multiple name-email pairs
Then in the main you need
The desired format with mutpliple Name-Email pairs is: same headers, and for each pair a separate row is printed to file, where all information other than Name-Email is the same.
The printed spreadsheet (URL used was provided in comments)