SOLUTION: Blogger losing images when a user leaves

PLEASE DON'T ASK ME TO DO THIS FOR YOU :-)

THIS IS JUST PROVIDED FOR INFO...

At the University of York we have had an issue with Blogger for years, in that, when a user leaves the university and their account is deleted, although a blog post's text remains, and the blog itself, every blog post's image is LOST because they are stored somewhere in that user's account.

I tried using command-line based scraping tools, like HTTrack to get a blog as static HTML files, but could never quite configure them to get all the content, the local images, the remote images but control the crawler enough not for them to wander off and try and download the entirety of YouTube.

I tried scraping tools.

I tried half a dozen aggregator tools.

I tried process-oriented tools like IFTT. No joy.

I tried writing my own scraper in python and failed. They moved the oAuth goalposts.

I tried using Google Picasa (which they then shut down).

I wanted to maybe use another service, like Flickr to re-host the images but that would mean that anyone using this would need their own FlickAPI account, or I'd have to create a semi-serious service (beyond me).

I tried to figure out the Blogger API and repeatedly fell at the oAuth hurdle until Martin Hawksey helped me out.

So, what this is not a tool to fix up your blog, but it's a description of how I did it.

First

Create a Blogger blog with a different account to the user that is going to leave. Go to Settings > Email and create a SECRET_EMAIL_ADDRESS and set it to publish immediately.

Second

I created a script that has Blogger API access ( no mean feat ), I created a Web App that does this...

var email = "SECRET_EMAIL_TO_YOUR_BLOG@blogger.com"

function doGet(e) {

if (e.parameter.name == "" | e.parameter.name == undefined){

var name = null

}else{

var name = e.parameter.name

}

Logger.log("id: " + e.parameter.id)// The file has to shared with the public of course so this web server can read it.

if (e.parameter.src != "" & e.parameter.src != undefined){ //an image has been provided...

try{

var src = e.parameter.src

Logger.log( "Getting image:" + src )

var blob = UrlFetchApp.fetch(src).getBlob()

if (name == null){

var name = blob.getName()

}

var options = {name: name,attachments: [ blob]}

MailApp.sendEmail(email, name, "", options) //Sends to the secret blog address.

Logger.log( "Mail sent")

Utilities.sleep(4500)

var blog_id = 'YOUR_BLOG_ID' // UoY Image Hosting

var result = get_posts( blog_id )

Logger.log( result )

var post = result.items[0] //Is this the latest one? Will it have got there?

var id = post.id

var author_name = post.author.displayName

var author_id = post.author.id

var title = post.title

var content = post.content

var regex = /<img.+?src=\"(.*?)\".+?>/ig

var images = []

while ( m = regex.exec(content)){ // this gets out the right regex group

images.push( m[1] )

}

Logger.log (images)

//sheet.appendRow( [blog_id, id, author_name, author_id, title, content])

var appData = {

"status": "ok",

"src": images[0],

"postLink": post.url,

"doAnother": "URL TO THIS APP?src="

};

var JSONString = JSON.stringify(appData);

var JSONOutput = ContentService.createTextOutput(JSONString);

JSONOutput.setMimeType(ContentService.MimeType.JSON);

return JSONOutput

}catch(e){

var appData = {" status":"error", "message": e + ": " + e.stack }

var JSONString = JSON.stringify(appData);

var JSONOutput = ContentService.createTextOutput(JSONString);

JSONOutput.setMimeType(ContentService.MimeType.JSON);

return JSONOutput

}

//If they haven't supplied a src, then show the info page

// Default Home Page

var template = HtmlService.createTemplateFromFile('info.html');

template.this_url = ScriptApp.getService().getUrl()

return template.evaluate();

}

function get_posts( blog_id ){
var url = 'https://www.googleapis.com/blogger/v3/blogs/' + blog_id + "/posts"
var scope = 'https://www.googleapis.com/auth/blogger';
var name = 'Blogger';
var fetchArgs = googleOAuth_(name,scope);

var blogObj = JSON.parse( UrlFetchApp.fetch(url, fetchArgs).getContentText() )
return blogObj
}
function test_get_posts(){
var blog_id = 'YOUR_BLOG_ID' //
Logger.log( get_posts( blog_id ))

}

So, after publishing it, I now can call this app's URL with ?src=myimageurl.jpg on the end and it will re-host it in another user's account and return some JSON with the new src in it. Handy eh?

Third

So now I can download an XML backup file from Blogger > Settings > Other and add it to my Google Drive.

Fourth

Now some scripts can read that file, find the images, and call the API that rehosts the images.

function fixUpBloggerXMLFile(){

var xmlFileID = "LONG DRIVE ID HERE" //Your export xml file

var xmlFile = DriveApp.getFileById(xmlFileID)

var xmlFileName = xmlFile.getName()

var xmlText = xmlFile.getBlob().getDataAsString()

var ss_id = "YOUR SPREADSHEET ID"

var ss = SpreadsheetApp.openById(ss_id)

var sheet = ss.getSheets()[0]

var range = sheet.getRange( "A2:B" + sheet.getLastRow())

var values = range.getValues()

for (v in values){

var row = values[v]

var oldurl = row[0]

var newurl = row[1]

if (newurl.indexOf("http") != -1){

xmlText = xmlText.replace(new RegExp(oldurl, 'g'), newurl);

}

DriveApp.createFile("FIXED-" + xmlFileName, xmlText)

}

function processNextImage(){

var ss_id = "YOUR SPREADSHEET ID"

var ss = SpreadsheetApp.openById(ss_id)

var sheet = ss.getSheets()[0]

var range = sheet.getRange( "A2:B" + sheet.getLastRow())

var values = range.getValues()

var i = 2

for (v in values){

var row = values[v]

var oldurl = row[0]

var newurl = row[1]

if (newurl == ""){

var newurl = getAlternativeImage(oldurl)

sheet.getRange(i, 2).setValue( newurl)

sheet.setRowHeight(i, 40)

}

i = i + 1

}

function getAlternativeImage(src){

var api = "URL TO YOUR API APP/exec?src="

try{

var url = api + src

var text = UrlFetchApp.fetch(url).getContentText()

var json = JSON.parse(text)

var status = json['status']

var new_src = json['src']

Logger.log( status, new_src)

return new_src

}catch(e){

return e + " " + e.stack

}

function getBlogsImages(){

var ss_id = "YOUR SPREADSHEET ID"

var ss = SpreadsheetApp.openById(ss_id)

var sheet = ss.getSheets()[0]

var images = readBloggerExportFile()

for (i in images){

var image = images[i]

sheet.appendRow([image])

}

function readBloggerExportFile() {

var xmlFileID = "YOUR BLOGGER EXPORT XML FILE" //phil's

var xmlFile = DriveApp.getFileById(xmlFileID)

var xml = xmlFile.getBlob().getDataAsString()

//Logger.log( xml )

var posts = BloggerExportFile_to_JSON(xmlFileID) /// This should be a list of [id, title, content ]

var allImages = []

var imageCount = 0

for ( p in posts){

var post = posts[p]

var id = post[0]

var title = post[1]

var content = post[2]

var images = findImages(content)

Logger.log( id + " " + content.length + " " + imageCount )

imageCount += images.length

//Logger.log( "---------------------------------------")

allImages = allImages.concat(images)

}

Logger.log( "There are " + imageCount + " images in this blog")

return allImages

}

function findImages(text){

var regex = /<img.+?src=\"(.*?)\".+?>/ig //var regex = /([^\s]+(?=\.(jpg|gif|png))\.\2)/gm; // ALL images including URLs in links

var images = []

while ( m = regex.exec(text)){ // this gets out the right regex group

images.push( m[1] )

}

return images

}

function BloggerExportFile_to_JSON(drive_id) {

var doc = DriveApp.getFileById(drive_id) // This is the XML file you get when you go to Blogger and use Settings > Other > Backup content that has been uploaded to Google Drive.

var xml = doc.getBlob().getDataAsString()

var xmldocument = XmlService.parse(xml);

var items = []

var result = {};

var root = xmldocument.getRootElement();

//Logger.log( root.getName())

result[root.getName()] = elementToJSON(root); //SO much easier to debug and traverse

var entries = result.feed.entry

for (var e in entries){

var entry = entries[ e ]

var id = entry.id.Text

if (id.indexOf("post") > -1){

//It's a blog post and not a setting. Why do Blogger do that? Is it just to re-use the atom format?

var regex = /post-([0-9])*/g;

var found_id = id.match(regex)[0].replace("post-", "") //hack;

items.push( [found_id, entry.title.Text, entry.content.Text])

}

Logger.log( "There are " + items.length + " blog posts")

return items;

}

/**

* Converts an XmlService element to a JSON object, using logic similar to

* the sunset method Xml.parse().

* @param {XmlService.Element} element The element to parse.

* @returns {Object} The parsed element.

function elementToJSON(element) {

var result = {};

// Attributes.

element.getAttributes().forEach(function(attribute) {

result[attribute.getName()] = attribute.getValue();

});

// Child elements.

element.getChildren().forEach(function(child) {

var key = child.getName();

var value = elementToJSON(child);

if (result[key]) {

if (!(result[key] instanceof Array)) {

result[key] = [result[key]];

}

result[key].push(value);

} else {

result[key] = value;

}

});

// Text content.

if (element.getText()) {

result['Text'] = element.getText();

}

return result;

}

If you add this formula, =IMAGE(A2, 4, 40, 50) it creates a spreadsheet that looks like this... with the old url to the image and the new image's url.

And generates a new Blogger export XML file that when you import it, doesn't duplicate the posts and replaces them with the images fixed up to be re-hosted on a different blog.

In Conclusion

I've complained (several times) to Google about this, and their response is kind of, "hey, blogging's dead you know", which is a shame.

The only "real" solution is to make a point of loading your images somewhere else first, then just using the URLs to those images, rather than uploading them into the Blogger interface, which is a real pain... but if you want your posts to live longer than your position, then that's the only way as far as I can see. But this method is some sort of potential "fix me up" of sorts.

Inserting A Google Doc link into a Google Spreadsheet (UPDATED 6/12/2017)

This article looks at using Apps Script to add new features to a Google Spreadsheet. At the University of York, various people have been using Google spreadsheets to collect together various project related information. We've found that when collecting lots of different collaborative information from lots of different people that a spreadsheet can work much better than a regular Google Form. Spreadsheets can be better than Forms for data collection because: The spreadsheet data saves as you are editing. If you want to fill in half the data and come back later, your data will still be there. The data in a spreadsheet is versioned, so you can see who added what and when and undo it if necessary The commenting features are brilliant - especially the "Resolve" button in comments. One feature we needed was to be able to "attach" Google Docs to certain cells in a spreadsheet. It's easy to just paste in a URL into a spreadsheet cell, but they can often...

Tom Smith13 November 2018 at 11:02
Addendum... we found that once we'd reimported the xml file that the URLs were now subtly different, being appended with an _ and a random number. Agh!

So after trying to write effectively a Javascript mapping redirect, which didn't work because the 404 page got called, I then turned the 404 page into one that searches for the old url by breaking it into words... like this...

var theURL = window.location.pathname.toString() ;
theURL = theURL.replace(".html", "");
var items = theURL.split("/");
items = items[items.length-1]
console.log( items )
theURL = items.toString()
var words = theURL.split("-");
var string = words.join( " " );
string = string.replace("/", "");
document.write( string );

var theNEWURL = "https://" + window.location.hostname + "/search?q=" + string ;
document.write( theNEWURL );
window.location = theNEWURL;

Try the old URL here:
http://digital-archiving.blogspot.com/2017/11/what-shall-i-do-for-international.html

Collaborative Tools Project

Search This Blog