PLEASE DON'T ASK ME TO DO THIS FOR YOU :-)
THIS IS JUST PROVIDED FOR INFO...
At the University of York we have had an issue with Blogger for years: when a user leaves the university and their account is deleted, the blog itself and each post's text survive, but every image in every post is LOST, because the images are stored somewhere in that user's account.
I tried command-line scraping tools like HTTrack to capture a blog as static HTML files, but I could never quite configure them to fetch all the content, the local images and the remote images, while still reining the crawler in enough that it didn't wander off and try to download the entirety of YouTube.
I tried scraping tools.
I tried half a dozen aggregator tools.
I tried process-oriented tools like IFTTT. No joy.
I tried writing my own scraper in Python and failed: they moved the OAuth goalposts.
I tried using Google Picasa (which they then shut down).
I wanted to maybe use another service, like Flickr, to re-host the images, but that would mean anyone using this would need their own Flickr API account, or I'd have to create a semi-serious service (beyond me).
I tried to figure out the Blogger API and repeatedly fell at the OAuth hurdle until Martin Hawksey helped me out.
So, this is not a tool to fix up your blog; it's a description of how I did it.
First
Create a Blogger blog with a different account from the user who is going to leave. Go to Settings > Email, create a SECRET_EMAIL_ADDRESS, and set it to publish emails immediately.
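It's worth testing that address before going any further. Emailing the secret address creates a post: the subject becomes the post title and the attachment becomes the post's image. A minimal sketch (the image URL below is purely illustrative; any publicly fetchable image will do):

function testSecretEmailPublish() {
  // This image URL is purely illustrative; swap in any publicly readable image.
  var blob = UrlFetchApp.fetch("https://example.com/some-image.jpg").getBlob();
  // Subject line = post title; attachment = post image.
  MailApp.sendEmail("SECRET_EMAIL_TO_YOUR_BLOG@blogger.com", "Test image post", "", {attachments: [blob]});
}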
Second
I created a script that has Blogger API access (no mean feat) and published it as a Web App that does this...
var email = "SECRET_EMAIL_TO_YOUR_BLOG@blogger.com";

function doGet(e) {
  if (e.parameter.name == "" || e.parameter.name == undefined) {
    var name = null;
  } else {
    var name = e.parameter.name;
  }
  Logger.log("id: " + e.parameter.id); // The file has to be shared with the public, of course, so this web app can read it.
  if (e.parameter.src != "" && e.parameter.src != undefined) { // An image has been provided...
    try {
      var src = e.parameter.src;
      Logger.log("Getting image: " + src);
      var blob = UrlFetchApp.fetch(src).getBlob();
      if (name == null) {
        name = blob.getName();
      }
      var options = {name: name, attachments: [blob]};
      MailApp.sendEmail(email, name, "", options); // Sends to the secret blog address; the subject becomes the post title.
      Logger.log("Mail sent");
      Utilities.sleep(4500); // Give Blogger a moment to publish the emailed post.
      var blog_id = 'YOUR_BLOG_ID'; // UoY Image Hosting
      var result = get_posts(blog_id);
      Logger.log(result);
      var post = result.items[0]; // Is this the latest one? Will it have got there?
      var id = post.id;
      var author_name = post.author.displayName;
      var author_id = post.author.id;
      var title = post.title;
      var content = post.content;
      var regex = /<img.+?src=\"(.*?)\".+?>/ig;
      var images = [];
      var m;
      while (m = regex.exec(content)) { // This pulls out the right regex group: the img src.
        images.push(m[1]);
      }
      Logger.log(images);
      //sheet.appendRow([blog_id, id, author_name, author_id, title, content]);
      var appData = {
        "status": "ok",
        "src": images[0],
        "postLink": post.url,
        "doAnother": "URL TO THIS APP?src="
      };
      var JSONString = JSON.stringify(appData);
      var JSONOutput = ContentService.createTextOutput(JSONString);
      JSONOutput.setMimeType(ContentService.MimeType.JSON);
      return JSONOutput;
    } catch (e) {
      var appData = {"status": "error", "message": e + ": " + e.stack};
      var JSONString = JSON.stringify(appData);
      var JSONOutput = ContentService.createTextOutput(JSONString);
      JSONOutput.setMimeType(ContentService.MimeType.JSON);
      return JSONOutput;
    }
  }
  // If they haven't supplied a src, then show the info page.
  // Default home page.
  var template = HtmlService.createTemplateFromFile('info.html');
  template.this_url = ScriptApp.getService().getUrl();
  return template.evaluate();
}
function get_posts(blog_id) {
  var url = 'https://www.googleapis.com/blogger/v3/blogs/' + blog_id + "/posts";
  var scope = 'https://www.googleapis.com/auth/blogger';
  var name = 'Blogger';
  var fetchArgs = googleOAuth_(name, scope); // The OAuth helper that authorises the Blogger API call: the hurdle Martin Hawksey helped me over.
  var blogObj = JSON.parse(UrlFetchApp.fetch(url, fetchArgs).getContentText());
  return blogObj;
}

function test_get_posts() {
  var blog_id = 'YOUR_BLOG_ID';
  Logger.log(get_posts(blog_id));
}
So, after publishing it, I can now call this app's URL with ?src=myimageurl.jpg on the end and it will re-host that image in another user's account and return some JSON with the new src in it. Handy, eh?
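Concretely, a call to the app returns JSON shaped like the appData object above, along these lines (the values here are purely illustrative):

{
  "status": "ok",
  "src": "https://blogger.googleusercontent.com/.../image.jpg",
  "postLink": "http://your-image-hosting-blog.blogspot.com/2018/01/image.html",
  "doAnother": "https://script.google.com/macros/s/.../exec?src="
}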
Third
So now I can download an XML backup file from Blogger > Settings > Other and add it to my Google Drive.
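If you need that file's Drive ID for the scripts below, one quick way to look it up (assuming you know the filename Blogger gave the export) is:

function findExportFileId() {
  // Adjust the name to match your export file.
  var files = DriveApp.getFilesByName("blog-export.xml");
  while (files.hasNext()) {
    var file = files.next();
    Logger.log(file.getName() + " : " + file.getId());
  }
}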
Fourth
Now some scripts can read that file, find the images, and call the API that rehosts the images.
function fixUpBloggerXMLFile() {
  var xmlFileID = "LONG DRIVE ID HERE"; // Your export XML file.
  var xmlFile = DriveApp.getFileById(xmlFileID);
  var xmlFileName = xmlFile.getName();
  var xmlText = xmlFile.getBlob().getDataAsString();
  var ss_id = "YOUR SPREADSHEET ID";
  var ss = SpreadsheetApp.openById(ss_id);
  var sheet = ss.getSheets()[0];
  var range = sheet.getRange("A2:B" + sheet.getLastRow()); // Column A: old URL, column B: new URL.
  var values = range.getValues();
  for (var v in values) {
    var row = values[v];
    var oldurl = row[0];
    var newurl = row[1];
    if (newurl.indexOf("http") != -1) { // Only swap rows that actually got a new URL.
      xmlText = xmlText.replace(new RegExp(oldurl, 'g'), newurl);
    }
  }
  DriveApp.createFile("FIXED-" + xmlFileName, xmlText);
}
function processNextImage() {
  var ss_id = "YOUR SPREADSHEET ID";
  var ss = SpreadsheetApp.openById(ss_id);
  var sheet = ss.getSheets()[0];
  var range = sheet.getRange("A2:B" + sheet.getLastRow());
  var values = range.getValues();
  var i = 2; // Row counter: the data starts on row 2.
  for (var v in values) {
    var row = values[v];
    var oldurl = row[0];
    var newurl = row[1];
    if (newurl == "") { // Not processed yet.
      newurl = getAlternativeImage(oldurl);
      sheet.getRange(i, 2).setValue(newurl);
      sheet.setRowHeight(i, 40);
    }
    i = i + 1;
  }
}
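With hundreds of images, a single run of processNextImage can hit the Apps Script execution time limit. One way to chew through them in batches (a sketch; the 10-minute interval is an arbitrary choice, not part of the original setup) is a time-driven trigger:

function scheduleImageProcessing() {
  // Re-runs processNextImage every 10 minutes; delete the trigger once column B is full.
  ScriptApp.newTrigger("processNextImage")
      .timeBased()
      .everyMinutes(10)
      .create();
}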
function getAlternativeImage(src) {
  var api = "URL TO YOUR API APP/exec?src=";
  try {
    var url = api + src;
    var text = UrlFetchApp.fetch(url).getContentText();
    var json = JSON.parse(text);
    var status = json['status'];
    var new_src = json['src'];
    Logger.log(status + " " + new_src);
    return new_src;
  } catch (e) {
    return e + " " + e.stack;
  }
}
function getBlogsImages() {
  var ss_id = "YOUR SPREADSHEET ID";
  var ss = SpreadsheetApp.openById(ss_id);
  var sheet = ss.getSheets()[0];
  var images = readBloggerExportFile();
  for (var i in images) {
    var image = images[i];
    sheet.appendRow([image]); // One image URL per row in column A.
  }
}
function readBloggerExportFile() {
  var xmlFileID = "YOUR BLOGGER EXPORT XML FILE";
  var posts = BloggerExportFile_to_JSON(xmlFileID); // This should be a list of [id, title, content].
  var allImages = [];
  var imageCount = 0;
  for (var p in posts) {
    var post = posts[p];
    var id = post[0];
    var title = post[1];
    var content = post[2];
    var images = findImages(content);
    imageCount += images.length;
    Logger.log(id + " " + content.length + " " + imageCount);
    allImages = allImages.concat(images);
  }
  Logger.log("There are " + imageCount + " images in this blog");
  return allImages;
}
function findImages(text) {
  var regex = /<img.+?src=\"(.*?)\".+?>/ig; // var regex = /([^\s]+(?=\.(jpg|gif|png))\.\2)/gm; // ALL images, including URLs in links.
  var images = [];
  var m;
  while (m = regex.exec(text)) { // This pulls out the right regex group: the img src.
    images.push(m[1]);
  }
  return images;
}
function BloggerExportFile_to_JSON(drive_id) {
  var doc = DriveApp.getFileById(drive_id); // This is the XML file you get from Blogger via Settings > Other > Backup content, uploaded to Google Drive.
  var xml = doc.getBlob().getDataAsString();
  var xmldocument = XmlService.parse(xml);
  var items = [];
  var result = {};
  var root = xmldocument.getRootElement();
  result[root.getName()] = elementToJSON(root); // SO much easier to debug and traverse.
  var entries = result.feed.entry;
  for (var e in entries) {
    var entry = entries[e];
    var id = entry.id.Text;
    if (id.indexOf("post") > -1) {
      // It's a blog post and not a setting. Why does Blogger do that? Is it just to re-use the Atom format?
      var regex = /post-([0-9]*)/g;
      var found_id = id.match(regex)[0].replace("post-", ""); // Hack.
      items.push([found_id, entry.title.Text, entry.content.Text]);
    }
  }
  Logger.log("There are " + items.length + " blog posts");
  return items;
}
/**
* Converts an XmlService element to a JSON object, using logic similar to
* the sunset method Xml.parse().
* @param {XmlService.Element} element The element to parse.
* @returns {Object} The parsed element.
*/
function elementToJSON(element) {
  var result = {};
  // Attributes.
  element.getAttributes().forEach(function(attribute) {
    result[attribute.getName()] = attribute.getValue();
  });
  // Child elements.
  element.getChildren().forEach(function(child) {
    var key = child.getName();
    var value = elementToJSON(child);
    if (result[key]) {
      if (!(result[key] instanceof Array)) {
        result[key] = [result[key]];
      }
      result[key].push(value);
    } else {
      result[key] = value;
    }
  });
  // Text content.
  if (element.getText()) {
    result['Text'] = element.getText();
  }
  return result;
}
This generates a new Blogger export XML file which, when you import it, doesn't duplicate the posts: it replaces them, with the image URLs fixed up to point at the copies re-hosted on the other blog.
In Conclusion
I've complained (several times) to Google about this, and their response is kind of, "hey, blogging's dead you know", which is a shame.
The only "real" solution is to make a point of loading your images somewhere else first, then just using the URLs to those images, rather than uploading them into the Blogger interface, which is a real pain... but if you want your posts to live longer than your position, then that's the only way as far as I can see. But this method is some sort of potential "fix me up" of sorts.
Addendum... we found that once we'd re-imported the XML file, the post URLs were subtly different, with an _ and a random number appended. Agh!
So, after trying to write what was effectively a JavaScript mapping redirect, which didn't work because the 404 page got called, I turned the 404 page itself into one that searches for the old URL by breaking it into words... like this...
var theURL = window.location.pathname.toString();
theURL = theURL.replace(".html", "");
var items = theURL.split("/");
items = items[items.length - 1]; // Just the post's slug.
console.log(items);
theURL = items.toString();
var words = theURL.split("-"); // The slug's hyphenated words...
var string = words.join(" "); // ...become a search query.
string = string.replace("/", "");
document.write(string);
var theNEWURL = "https://" + window.location.hostname + "/search?q=" + string;
document.write(theNEWURL);
window.location = theNEWURL; // Redirect to the blog's own search for those words.
Try the old URL here:
http://digital-archiving.blogspot.com/2017/11/what-shall-i-do-for-international.html
with a script tag wrapped around it obv.
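That is, the whole snippet above sits inside a script element on the blog's custom 404 (Page Not Found) page, something like:

<script>
// ...the redirect code above goes here...
</script>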