For anyone who wants to harvest a repository via OAI-PMH, this simple Perl program can be a good starting point.
The endpoint used in this example serves the genealogical data of Archive Leiden (Netherlands), which is available as open data (CC0); see the data blog post on OpenCultureData about this set.
#!/usr/bin/perl
use strict;
use warnings;

use File::Path qw(make_path);
use LWP::UserAgent;
use Net::OAI::Harvester;

#
# This simple Perl program harvests a whole OAI-PMH (A2A)
# repository and stores the response of every request in a
# separate, numbered XML file for later processing.
#
# Flow: issue an initial ListRecords request, save the response,
# then keep following the resumption token the repository returns
# until no token is left.
#

#
# Configuration
#

# End-point URL of the OAI-PMH API
my $strEndpointUrl = 'http://api.memorix-maior.nl/collectiebeheer/a2a/key/dd3f8a56-11e9-11e2-825b-00163e60bf4d/tenant/lei/';

# Directory where to save the XML files
my $strDir = './harvested/';

# Be nice, identify yourself to the repository
my $strAgent = 'CoretGenealogie-OAI-PMH-harvest/0.1';

#
# Main program
#

# Create user agent for identification
my $ua = LWP::UserAgent->new();
$ua->agent($strAgent);
$ua->timeout(30);    # set timeout to 30 seconds

# Uncomment to see debug information
#$Net::OAI::Harvester::DEBUG = 1;

# Make sure the output directory exists before the first write
make_path($strDir) unless -d $strDir;

# Create the harvester
my $harvester = Net::OAI::Harvester->new(
    baseURL   => $strEndpointUrl,
    userAgent => $ua,
);

# List all the records in the repository,
# adjust metadataPrefix to your need (depends on repository)
# A2A = Archive to archive = Dutch archive standard for
# genealogical data
my $records = $harvester->listRecords(
    metadataPrefix => 'oai_a2a',
);

# Every request gets saved in a numbered XML file
my $counter = 1;

# Disable buffered output to live view the counter
$| = 1;

# $records holds the response still to be saved; it becomes undef
# once the repository stops handing out resumption tokens.
while ($records) {

    # Stop on OAI-PMH level errors instead of silently saving them
    if ( my $strErrorCode = $records->errorCode() ) {
        die "OAI-PMH error on request $counter: $strErrorCode - "
            . $records->errorString() . "\n";
    }

    # Save the harvested data in an XML file
    my $strFile = $strDir . "$counter.xml";
    open( my $fhOut, '>', $strFile )
        or die "Can't write file $strFile: $!";
    print {$fhOut} $records->xml();
    close($fhOut)
        or die "Can't close file $strFile: $!";

    print "$counter ";
    $counter++;

    # Check if there's more to harvest. The token is deliberately a
    # fresh lexical per iteration: the loop is driven by $records, so
    # a stale token can never keep the loop alive.
    my $rToken = $records->resumptionToken();
    my $strToken = $rToken ? $rToken->token() : undef;

    $records
        = ( defined $strToken && length $strToken )
        ? $harvester->listRecords( resumptionToken => $strToken )
        : undef;
}

# That's it!