Push & Pull : Scraped Data into Index and back

With many scrapers being integrated into the Loklak server, it is only natural that the load on the server increases when a multitude of requests has to be served every millisecond.

Initially, when Loklak only harvested tweets from Twitter, Elasticsearch was set up along with a Data Access Object (DAO) to handle the indexing.

The JSON objects pushed into the index took the form of statuses and had to follow a specific format so that they could be stored in and retrieved from the index easily.

Sample:


{
  "statuses": [
    {
      "id_str": "yourmessageid_1234",
      "screen_name": "testuser",
      "created_at": "2016-07-22T07:53:24.000Z",
      "text": "The rain is spain stays always in the plain",
      "source_type": "GENERIC",
      "place_name": "Georgia, USA",
      "location_point": [
        3.058579854228782,
        50.63296878274201
      ],
      "location_radius": 0,
      "user": {
        "user_id": "youruserid_5678",
        "name": "Mr. Bob",
        
      }
    }
  ]
}

But with the inclusion of many other scrapers like GitHub, WordPress, Eventbrite etc. and RSS readers, it became cumbersome to use the exact same format as Twitter's because not all fields matched.

For example:


{
  "data": [
    {
      "location": "Canada - Ontario - London",
      "time": "Sun 9:33 PM"
    },
    {
      "location": "South Africa - East London",
      "time": "Mon 3:33 AM"
    }
  ]
}

Hence, Scott suggested implementing a DAO wrapper which would enable us to use the same schema as the Twitter index to push and pull data.

The DAO wrapper was implemented as GenericJSONBuilder, which can add the remaining data fields, other than the text, into the same schema using a Markdown-style format.
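For illustration, a typical (hypothetical) use of the builder, based on the methods shown in the code below, could look like this; only the text is strictly required, every other field falls back to a default:

// illustrative sketch only: the author, text and extra field are made-up values
DAOWrapper.builder()
    .setAuthor("testuser", "example.com")                      // becomes screen_name and user_name
    .addText("The rain in Spain stays mainly in the plain")    // required
    .addField("place", "Georgia, USA")                         // appended to the text as a key/value pair
    .persist();                                                // pushes the message into the index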

Peeking into the code:


package org.loklak.data;

import javafx.util.Pair;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.QueryEntry;
import org.loklak.objects.SourceType;
import org.loklak.objects.UserEntry;

import java.net.MalformedURLException;
import java.util.*;

/**
 * The json below is the minimum json
 * {
 "statuses": [
 {
 "id_str": "yourmessageid_1234",
 "screen_name": "testuser",
 "created_at": "2016-07-22T07:53:24.000Z",
 "text": "The rain is spain stays always in the plain",
 "source_type": "GENERIC",
 "place_name": "Georgia, USA",
 "location_point": [3.058579854228782,50.63296878274201],
 "location_radius": 0,
 "user": {
 "user_id": "youruserid_5678",
 "name": "Mr. Bob",
 }
 }
 ]
 }
 */
public class DAOWrapper {
    public static final class GenericJSONBuilder{
        private String id_str = null;
        private String screen_name = "unknown";
        private Date created_at = null;
        private String text = "";
        private String place_name = "unknown";
        private String user_name = "[email protected]";
        private String user_id = "unknown";
        private String image = null;
        private double lng = 0.0;
        private double lat = 0.0;
        private int loc_radius = 0;
        private ArrayList<String> extras = new ArrayList<>();


        /**
         * Not required
         * @param author
         * @param domain
         * @return
         */
        public GenericJSONBuilder setAuthor(String author, String domain){
            user_name = author + "@" + domain;
            screen_name = author;
            return this;
        }

        /**
         * Not required
         * @param user_id_
         * @return
         */
        public GenericJSONBuilder setUserid(String user_id_){
            user_id = user_id_;
            return this;
        }

        /**
         * Not required
         * @param id_str_
         * @return
         */
        public GenericJSONBuilder setIDstr(String id_str_){
            id_str = id_str_;
            return this;
        }

        /**
         * Not required
         * @param createdTime
         * @return
         */
        public GenericJSONBuilder setCreatedTime(Date createdTime){
            created_at = createdTime;
            return this;
        }

        /**
         * Required
         * This is the text field. You can use JSON style in this field
         * @param text_
         * @return
         */
        public GenericJSONBuilder addText(String text_){
            text = text + text_;
            return this;
        }

        /**
         * Not required
         * @param name
         * @return
         */
        public GenericJSONBuilder setPlaceName(String name){
            place_name = name;
            return this;
        }

        /**
         * Not required
         * @param longtitude
         * @param latitude
         * @return
         */
        public GenericJSONBuilder setCoordinate(double longtitude, double latitude){
            lng = longtitude;
            lat = latitude;
            return this;
        }

        /**
         * Not required
         * @param radius
         * @return
         */
        public GenericJSONBuilder setCoordinateRadius(int radius){
            loc_radius = radius;
            return this;
        }


        /**
         * Not required
         * @param key
         * @param value
         * @return
         */
        public GenericJSONBuilder addField(String key, String value){
            String pair_string = "\"" + key + "\": \"" + value + "\"";
            extras.add(pair_string);
            return this;
        }

        private String buildFieldJSON(){
            String extra_json = "";
            for(String e:extras){
                extra_json =  extra_json + e + ",";
            }
            if(extra_json.length() > 2) extra_json = "{" + extra_json.substring(0, extra_json.length() -1) + "}";
            return extra_json;
        }

        /**
         * Not required
         * @param link_
         * @return
         */
        public GenericJSONBuilder setImage(String link_){
            image = link_;
            return this;
        }

        public void persist(){
            try{
                //building message entry
                MessageEntry message = new MessageEntry();

                /**
                 * Use hash of text if id of message is not set
                 */
                if(id_str == null)
                    id_str = String.valueOf(text.hashCode());

                message.setIdStr(id_str);

                /**
                 * Get current time if not set
                 */
                if(created_at == null)
                    created_at = new Date();
                message.setCreatedAt(created_at);


                /**
                 * Append the field as JSON text
                 */
                message.setText(text + buildFieldJSON());

                double[] locPoint = new double[2];
                locPoint[0] = lng;
                locPoint[1] = lat;

                message.setLocationPoint(locPoint);

                message.setLocationRadius(loc_radius);

                message.setPlaceName(place_name, QueryEntry.PlaceContext.ABOUT);
                message.setSourceType(SourceType.GENERIC);

                /**
                 * Insert if there is a image field
                 */
                if(image != null) message.setImages(image);

                //building user
                UserEntry user = new UserEntry(user_id, screen_name, "", user_name);

                //build message and user wrapper
                DAO.MessageWrapper wrapper = new DAO.MessageWrapper(message,user, true);

                DAO.writeMessage(wrapper);
            } catch (MalformedURLException e){
            }
        }
    }





    public static GenericJSONBuilder builder(){
        return new GenericJSONBuilder();
    }





    public static void insert(Insertable msg){

        GenericJSONBuilder bd = builder()
        .setAuthor(msg.getUsername(), msg.getDomain())
        .addText(msg.getText())
        .setUserid(msg.getUserID());


        /**
         * Insert the fields
         */
        List<Pair<String, String>> fields = msg.getExtraField();
        for(Pair<String, String> field:fields){
            bd.addField(field.getKey(), field.getValue());
        }

        // finally push the assembled message into the index
        bd.persist();
    }
}

The DAOWrapper was then used with the other scrapers to push data into the index like this:


...
DAOWrapper.builder()
    .addText(json.toString())
    .setUserid("profile_" + profile)
    .persist();
...

Here, addText(...) can be called several times to append text to the object, but the set...(...) methods should each be used only once, and persist() should be called exactly once, since it is the method that finally pushes the message into the index.

Now, when a scraper receives a request to scrape a given HTML page, it first checks whether the data already exists in the index with the help of a unique user ID string. This saves the time and effort of scraping the page all over again; instead, the saved instance is simply returned.

The check is done something like this:


if(DAO.existUser("profile_"+profile)){
    /*
     *  Return existing JSON Data
    */
}else{
    /*
     *  Scrape the HTML Page addressed by the given URL
    */
}

This pushing and pulling into the index would certainly reduce the load on the Loklak server.

Feel free to ask questions regarding the above.

Feedback and Suggestions welcome 🙂


The Making of the Console Service

SUSI, our very own personal digital assistant, has been up and running, giving quirky answers.

But behind all this are rules which train our cute bot and help her decide what answers to provide after parsing the questions asked by users.

The questions could range from formal or informal greetings and general queries about her name, the weather, the date or the time, to specific ones like details about some random GitHub profile, tweets and replies from Twitter or Weibo, election or football score predictions, or simply asking her to read an RSS feed or a WordPress blog for you.

The rules for her training are written after the specific service is implemented which helps her fetch the particular website or social network in question and scrape data out of it to present to her operator.

And to help us expand the scope and ability of this naive being, it would be helpful if users could extend her rule set. For this, a console service needs to be made for sites which do not provide access to information without OAuth.

To begin with, let us see how a console service can be made.

We start with a SampleService class, defined in the package org.loklak.api.search, which basically contains the rudimentary scraper or the code fetching the data.
It is made by extending the AbstractAPIHandler class, which itself extends the javax.servlet.http.HttpServlet class.
The SampleService class further implements the APIHandler interface.

A placeholder for the SampleService class could look like this:


package org.loklak.api.search;

/**
* import statements
**/

public class SampleService extends AbstractAPIHandler 
    implements APIHandler{

    private static final long serialVersionUID = 2142441326498450416L;
    /**
     * serialVersionUID could be 
     * auto-generated by the IDE used
    **/

    @Override
    public String getAPIPath() {
        return "/api/service.json";
        /**
         *Choose API path for the service in question
        **/
    }

    @Override
    public BaseUserRole getMinimalBaseUserRole() {
        return BaseUserRole.ANONYMOUS;
    }

    @Override
    public JSONObject getDefaultPermissions(BaseUserRole baseUserRole) {
        return null;
    }

    @Override
    public JSONObject serviceImpl(Query call, HttpServletResponse response, 
        Authorization rights, JSONObjectWithDefault permissions) 
        throws APIException {

        String url = call.get("url", "");
        /**
         *This would extract the argument that will be supplied
         * to the "url" parameter in the "call"
        **/
        return crawlerForService(url);

    }

    public static SusiThought crawlerForService(String url) {
        JSONArray arr = new JSONArray();
        
        /**
         * Crawler code or any other function which
         * returns a JSON Array **arr** goes in here 
        **/

        SusiThought json = new SusiThought();
        json.setData(arr);
        return json;
    }

}

 

The JSONArray in the key function crawlerForService is wrapped up in a SusiThought, which is nothing but a piece of data that can be remembered. The structure, or the thought, can be modelled as a table which may be created using the retrieval of information from elsewhere in the current argument.
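Concretely, the rows of such a table are just a JSONArray of JSONObjects; a SusiThought wrapping two scraped rows would carry them roughly like this (the actual response envelope may contain additional metadata fields):

{
  "data": [
    { "location": "Canada - Ontario - London", "time": "Sun 9:33 PM" },
    { "location": "South Africa - East London", "time": "Mon 3:33 AM" }
  ]
}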

Now, to implement it as a console service, we include it in the ConsoleService class, which is defined in the same package org.loklak.api.search and similarly extends AbstractAPIHandler and implements APIHandler.

Here, dbAccess is a static variable of the type SusiSkills where a skill is defined as the ability to inspire, to create thoughts from perception. The data structure of a skill set is a mapping from perception patterns to lambda expressions which induce thoughts.


package org.loklak.api.search;

/**
 * import statements go here
**/

public class ConsoleService extends AbstractAPIHandler 
    implements APIHandler {

    private static final long serialVersionUID = 8578478303032749879L;
    /**
     * serialVersionUID could be 
     * auto-generated by the IDE used
    **/

    @Override
    public BaseUserRole getMinimalBaseUserRole() { 
        return BaseUserRole.ANONYMOUS; 
    }

    @Override
    public JSONObject getDefaultPermissions(BaseUserRole baseUserRole) {
        return null;
    }

    public String getAPIPath() {
        return "/api/console.json";
    }

    public final static SusiSkills dbAccess = new SusiSkills();
    static {

        /**
         * Other "skills" are defined here
         * by "putting" them in "dbAccess"
        **/

        dbAccess.put(Pattern.compile("SELECT\\h+?(.*?)\\h+?FROM\\h+?sampleservice"
                + "\\h+?WHERE\\h+?url\\h??=\\h??'(.*?)'\\h??;"),
            (flow, matcher) -> {
                /**
                 * SusiThought-s are fetched from the Services
                 * implemented as above
                **/
                SusiThought json = SampleService.crawlerForService(matcher.group(2));
                SusiTransfer transfer = new SusiTransfer(matcher.group(1));
                json.setData(transfer.conclude(json.getData()));
                return json;
            });
    }

    @Override
    public JSONObject serviceImpl(Query post, HttpServletResponse response, 
        Authorization rights, final JSONObjectWithDefault permissions) 
        throws APIException {

            String q = post.get("q", "");
            /**
             *This would extract the argument that will be supplied
             * to the "q" parameter in the "post" query
            **/
            

            return dbAccess.inspire(q);
        }

}

 

Now that the console service is in place, the corresponding API endpoint can be queried as: http://localhost:9000/api/console.json?q=SELECT * FROM sampleservice WHERE url = ‘ … ‘;
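To see how an incoming query maps onto the skill registered above, here is a small standalone check (the query value is just an example):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ConsoleQueryDemo {
    public static void main(String[] args) {
        // same pattern as registered in dbAccess above
        Pattern p = Pattern.compile("SELECT\\h+?(.*?)\\h+?FROM\\h+?sampleservice"
                + "\\h+?WHERE\\h+?url\\h??=\\h??'(.*?)'\\h??;");
        Matcher m = p.matcher("SELECT * FROM sampleservice WHERE url = 'http://example.com';");
        if (m.find()) {
            System.out.println(m.group(1));  // "*" -> the columns handed to SusiTransfer
            System.out.println(m.group(2));  // "http://example.com" -> handed to the SampleService crawler
        }
    }
}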

The above can serve as a placeholder for creating a console service, which shall enable SUSI to widen her horizon and become more intelligent.

So, Go ahead and make Susi rules using it and you are done !

If any aid is required in making SUSI Rules, stay tuned for the next post.

Come, contribute to Loklak and SUSI !


Spin-Off: Loklak fuels Open Event

Continuing with the Loklak & Open Event partnership (check out Loklak fuels Open Event), we can now, in just a few clicks, create our very own web app for an event with details imported from eventbrite.com, powered by Loklak.

The scraping of data using JSoup, the Java HTML parser, was explained in the previous post of this series.

Next, a console service was implemented as the single point of information retrieval from various social networks and websites (a post on it coming soon 😉), especially for SUSI (our very own personal digital assistant, a cute one indeed!).

The JSONArray result of the EventBriteCrawler was set in a SusiThought, which is nothing but a piece of data that can be remembered. The structure, or the thought, can be modelled as a table which may be created using the retrieval of information from elsewhere in the current argument.


/** Defining SusiThought as a class 
 * which extends JSONObject
 */

public class SusiThought extends JSONObject {

/* details coming soon.... */

}

/** Modifications in EventBriteCrawler
 *  Returning SusiThought instead of 
 * a simple JSONObject/JSONArray.
 */
public static SusiThought crawlEventBrite(String url) {
    ...
    ...    
    SusiThought json = new SusiThought();
    json.setData(jsonArray);
    return json;
}

 

The API endpoint was thus created.
A sample query: http://loklak.org/api/console.json?q=SELECT * FROM eventbrite WHERE url=’https://www.eventbrite.fr/e/billets-europeade-2016-concert-de-musique-vocale-25592599153′;


The files generated were next imported into the Open Event Web App generator in a few simple steps.


It’s amazing to see how a great visual platform is provided to edit details parsed from the plain JSONObject and deploy the personalized web-app !

Tadaa!
We have our very own event web app with all the information imported from eventbrite.com in a single (well, very few 😛) click(s)!

With this, we conclude the Loklak – Open Event – EventBrite series.

Stay tuned for detailed post on SUSI and Console Services 🙂


TopMenu and SiteMaps – Making loklak crawlable

So now we have seen how loklak_depot actually started off: with an accounts system and a lot of security fixes (the AAA system, etc.). We have made the foundation of loklak_depot as simple and branched-out as possible. But before we go on to working on Q&A apps and Susi (the intelligent query system of loklak_depot), I figured out one problem.

How do the users on the WWW get to know about this?

loklak had not been made crawlable until recently. This prevented search engines from crawling loklak.org and displaying its results. To improve our reach, enabling crawling thus became necessary.

To enable crawling, what we needed was a sitemap.xml file and a robots.txt. The sitemap specifies the URLs branching out from the main page (including the main page itself), and the robots.txt mainly specifies the parts of the site which should NOT be crawled. Thus, both had to be created.

Talking about the main loklak.org website, if you visit the site, you will see a menu at the top which leads to the various links (let's refer to it as the TopMenu). Once a crawler knows that these links are there, it will automatically crawl them. So it would be simple to create a plain XML file listing those links. But here’s the catch.

We knew loklak.org was something all of us are working on (and updating) regularly, and so the TopMenu is also bound to change. We also did not want to keep updating the HTML files to accommodate changes in the TopMenu. So we decided to do two things:

1. Make the TopMenu dynamic so that only a little change can update it.
2. Generate and update the sitemap.xml dynamically from TopMenu changes, without hand-editing the XML.

For Part 1, we decided to implement a servlet which returns a JSON containing the TopMenu items and their links. We then implement an Angular function which parses this JSON and changes the TopMenu dynamically.

Here is the servlet TopMenuService.java. It’s pretty easy to understand:


public class TopMenuService extends AbstractAPIHandler implements APIHandler {
    
    private static final long serialVersionUID = 1839868262296635665L;

    @Override
    public BaseUserRole getMinimalBaseUserRole() { return BaseUserRole.ANONYMOUS; }

    @Override
    public JSONObject getDefaultPermissions(BaseUserRole baseUserRole) {
        return null;
    }

    @Override
    public String getAPIPath() {
        return "/cms/topmenu.json";
    }
    
    @Override
    public JSONObject serviceImpl(Query call, Authorization rights, final JSONObjectWithDefault permissions) {
        
        int limited_count = (int) DAO.getConfig("download.limited.count", (long) Integer.MAX_VALUE);
    
        JSONObject json = new JSONObject(true);
        JSONArray topmenu = new JSONArray()
            .put(new JSONObject().put("Home", "index.html"))
            .put(new JSONObject().put("About", "about.html"))
            .put(new JSONObject().put("Showcase", "showcase.html"))
            .put(new JSONObject().put("Architecture", "architecture.html"))
            .put(new JSONObject().put("Download", "download.html"))
            .put(new JSONObject().put("Tutorials", "tutorials.html"))
            .put(new JSONObject().put("API", "api.html"));
        if (limited_count > 0) topmenu.put(new JSONObject().put("Dumps", "dump.html"));
        topmenu.put(new JSONObject().put("Apps", "apps/applist/index.html"));
        json.put("items", topmenu);

    }
}

As seen, in the serviceImpl method we build a JSONObject containing all the loklak.org TopMenu entries with their URLs, and this object is returned.
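Given the entries above, the JSON served at /cms/topmenu.json would look roughly like this (abridged):

{
  "items": [
    {"Home": "index.html"},
    {"About": "about.html"},
    {"Showcase": "showcase.html"},
    ...
    {"Apps": "apps/applist/index.html"}
  ]
}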

Now what we want is to make the changes to index.html and the JavaScript, and here they are:

JS:



angular.element(document).ready(function () {
  var navString = "";
  var winLocation = window.location.href;
  $.getJSON("/cms/topmenu.json", function(data) {
    navItems = data.items;
    navItems = navItems.reverse();
    var count = 0;
    $.each( navItems, function(index, itemData) {
      name = Object.keys(itemData);
      link = itemData[name];
      // Now construct the li items
      liItem = "<li>";
      if (winLocation.indexOf(link) != -1 && count != 1) {
        liItem = "<li class='active'>";
        count = count + 1;
      }
      liItem += "<a href='\/"+link+"'>"+name+"</a></li>";
      liItem = $(liItem);
      $('#navbar > ul').prepend(liItem);
    });
  });
});

HTML:



<nav class="navbar navbar-inverse navbar-fixed-top">
      <div class="container-fluid">
        <div class="navbar-header">
          <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
            <span class="sr-only">Toggle navigation</span>
            <span class="icon-bar"></span>
            <span class="icon-bar"></span>
            <span class="icon-bar"></span>
          </button>
          <a class="navbar-brand" href="#"></a>
        </div>
        <div id="navbar" class="navbar-collapse collapse">
          <ul class="nav navbar-nav navbar-right">
            <!-- This will get populated -->
          </ul>
        </div>
      </div>
    </nav>

In the Angular function, we parse the JSON and insert the items into the TopMenu markup in the HTML. So basically all we need to do is change the entries in TopMenuService.java and the TopMenu will get updated.

So this is Part 1 done. Now comes the crawling part. We need to use TopMenuService.java in another servlet so that changing the entries in TopMenuService.java alone will update the sitemap. TopMenuService is thus the central servlet: changing it should update both the sitemap and the TopMenu URLs as shown above.

So I coded another servlet which parses the JSON from TopMenu and makes up a SiteMap:


public class Sitemap extends HttpServlet {

	private static final long serialVersionUID = -8475570405765656976L;
	private final String sitemaphead = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
			+ "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n";

	@Override
	protected void doPost(HttpServletRequest request, HttpServletResponse response)
			throws ServletException, IOException {
		doGet(request, response);
	}

	@Override
	protected void doGet(HttpServletRequest request, HttpServletResponse response)
			throws ServletException, IOException {
		Query post = RemoteAccess.evaluate(request);
		// String siteurl = request.getRequestURL().toString();
		// String baseurl = siteurl.substring(0, siteurl.length() -
		// request.getRequestURI().length()) + request.getContextPath() + "/";
		String baseurl = "http://loklak.org/";
		JSONObject TopMenuJsonObject = new TopMenuService().serviceImpl(post, null, null);
		JSONArray sitesarr = TopMenuJsonObject.getJSONArray("items");
		response.setCharacterEncoding("UTF-8");
		PrintWriter sos = response.getWriter();
		sos.print(sitemaphead + "\n");
		for (int i = 0; i < sitesarr.length(); i++) {
			JSONObject sitesobj = sitesarr.getJSONObject(i);
			Iterator sites = sitesobj.keys();
			sos.print("<url>\n<loc>" + baseurl + sitesobj.getString(sites.next().toString()) + "/</loc>\n"
					+ "<changefreq>weekly</changefreq>\n</url>\n");
		}
		sos.print("</urlset>");
		sos.println();
		post.finalize();
	}
}

The XML adheres to the sitemap standard as prescribed here. Basically, I just took the JSON from TopMenuService, used an Iterator to get the keys (if you look at the JSON, you will notice I only need the values from all the objects in the JSONArray), and then printed it out using a PrintWriter.
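For the TopMenu entries shown earlier, the servlet would print something roughly like this (abridged):

<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://loklak.org/index.html/</loc>
<changefreq>weekly</changefreq>
</url>
<url>
<loc>http://loklak.org/about.html/</loc>
<changefreq>weekly</changefreq>
</url>
...
</urlset>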

Since we wanted all the URLs to be crawled in the sitemap, the robots.txt looks something like:


User-agent: *
Sitemap: http://loklak.org/api/sitemap.xml

So now we have achieved a dynamically updating sitemap and TopMenu, all controlled using only a JSONObject in TopMenuService.java. Easy, no?

That’s all for now. In my next post, I will be talking about the Q&A Apps I’m working on, as well as a bit about Susi. Till then, ciao! Feedback as always is appreciated 🙂


Loklak getting the followers from Weibo

Like Twitter, Loklak has started to scrape Weibo data. Sina Weibo is a Chinese microblogging (weibo) website. Akin to a hybrid of Twitter and Facebook, it is one of the most popular sites in China, in use by well over 30% of Internet users, with a market penetration similar to that of Twitter in the United States.

I have started to scrape the user’s bio page which looks something like this.

[Screenshot: a user's profile page on Weibo]

The above image is a user's profile on Weibo. The profile has two frames: one is the user's bio, which is similar to Facebook's bio page, and the second is very similar to Twitter's format. So I scraped the user's followers details, which were laid out in the form of a table. Using JSoup we can scrape tables easily.
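As a rough illustration of the approach (the URL and selectors below are placeholders, not the actual Weibo ones), a table can be scraped with JSoup along these lines:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class TableScraperSketch {
    public static void main(String[] args) throws Exception {
        // placeholder URL: replace with the profile page to be scraped
        Document page = Jsoup.connect("http://example.com/profile").get();

        // walk every row of every table on the page and read its cells
        Elements rows = page.select("table tr");
        for (Element row : rows) {
            Elements cells = row.select("td");
            for (Element cell : cells) {
                System.out.print(cell.text() + " | ");
            }
            System.out.println();
        }
    }
}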

[Screenshots: the followers table on the Weibo profile page and the scraped result]

This is how the table on the profile page gets scraped to obtain the followers data from Weibo. Stay tuned for more scraping updates.


Convert web pages into structured data

Loklak provides a new API which converts web pages into structured JSON data. The genericscraper API helps you scrape any web page from a given URL and provides you with the structured JSON data. Just place the URL in the given format: http://localhost:9000/api/genericscraper.json?url=http://www.google.com

This scrapes generic data from a given web page URL; for instance, this is the resulting JSON after scraping the main Google search page.

{
  "Text in Links": [
    "Images",
    "Maps",
    "Play",
    "YouTube",
    "News",
    "Gmail",
    "Drive",
    "More »",
    "Web History",
    "Settings",
    "Sign in",
    "Advanced search",
    "Language tools",
    "हिन्दी",
    "বাংলা",
    "తెలుగు",
    "मराठी",
    "தமிழ்",
    "ગુજરાતી",
    "ಕನ್ನಡ",
    "മലയാളം",
    "ਪੰਜਾਬੀ",
    "Advertising Programs",
    "Business Solutions",
    "+Google",
    "About Google",
    "Google.com",
    "Privacy",
    "Terms"
  ],
  "Image files": [],
  "source files": [],
  "Links": [
    "http://www.google.co.in/imghp?hl=en&tab=wi",
    "http://maps.google.co.in/maps?hl=en&tab=wl",
    "https://play.google.com/?hl=en&tab=w8",
    "http://www.youtube.com/?gl=IN&tab=w1",
    "http://news.google.co.in/nwshp?hl=en&tab=wn",
    "https://mail.google.com/mail/?tab=wm",
    "https://drive.google.com/?tab=wo",
    "https://www.google.co.in/intl/en/options/",
    "http://www.google.co.in/history/optout?hl=en",
    "/preferences?hl=en",
    "https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=http://www.google.co.in/%3Fgfe_rd%3Dcr%26ei%3DR_xpV6G9M-PA8gfis7rIDA",
    "/advanced_search?hl=en-IN&authuser=0",
    "/language_tools?hl=en-IN&authuser=0",
    "http://www.google.co.in/setprefs?sig=0_VODpnfQFFvCo-TLhn2_Kr9sRC2c%3D&hl=hi&source=homepage",
    "http://www.google.co.in/setprefs?sig=0_VODpnfQFFvCo-TLhn2_Kr9sRC2c%3D&hl=bn&source=homepage",
    "http://www.google.co.in/setprefs?sig=0_VODpnfQFFvCo-TLhn2_Kr9sRC2c%3D&hl=te&source=homepage",
    "http://www.google.co.in/setprefs?sig=0_VODpnfQFFvCo-TLhn2_Kr9sRC2c%3D&hl=mr&source=homepage",
    "http://www.google.co.in/setprefs?sig=0_VODpnfQFFvCo-TLhn2_Kr9sRC2c%3D&hl=ta&source=homepage",
    "http://www.google.co.in/setprefs?sig=0_VODpnfQFFvCo-TLhn2_Kr9sRC2c%3D&hl=gu&source=homepage",
    "http://www.google.co.in/setprefs?sig=0_VODpnfQFFvCo-TLhn2_Kr9sRC2c%3D&hl=kn&source=homepage",
    "http://www.google.co.in/setprefs?sig=0_VODpnfQFFvCo-TLhn2_Kr9sRC2c%3D&hl=ml&source=homepage",
    "http://www.google.co.in/setprefs?sig=0_VODpnfQFFvCo-TLhn2_Kr9sRC2c%3D&hl=pa&source=homepage",
    "/intl/en/ads/",
    "http://www.google.co.in/services/",
    "https://plus.google.com/104205742743787718296",
    "/intl/en/about.html",
    "http://www.google.co.in/setprefdomain?prefdom=US&sig=__SF4cV2qKAyiHu9OKv2V_rNxesko%3D",
    "/intl/en/policies/privacy/",
    "/intl/en/policies/terms/",
    "/images/branding/product/ico/googleg_lodp.ico"
  ],
  "language": "en-IN",
  "title": "Google",
  "Script Files": []
}

I wrote a generic scraper using JSoup, the popular Java HTML scraping library. It scrapes generic fields like the title, images, links, source files and the text inside links. After the generic scraper was ready, I registered the API endpoint api/genericscraper.json along with the servlet.


I loaded the page from the given URL, taking the value from the url parameter. I selected the relevant tags using getElementsByTag. After storing the elements, I looped through the list, retrieved the attributes from each tag, stored the data accordingly and pushed it into a JSONArray. After the necessary scraping I put the JSONArrays into a JSON object and pretty printed it.
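As a rough sketch of this flow (not the exact servlet code behind api/genericscraper.json, which differs in detail), the scraping part could look like this with JSoup and org.json:

import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class GenericScraperSketch {

    public static JSONObject scrape(String url) throws Exception {
        Document page = Jsoup.connect(url).get();

        // anchor tags give both the link targets and the text between links
        JSONArray links = new JSONArray();
        JSONArray linkTexts = new JSONArray();
        for (Element a : page.getElementsByTag("a")) {
            links.put(a.attr("href"));
            linkTexts.put(a.text());
        }

        // image tags give the image files
        JSONArray images = new JSONArray();
        for (Element img : page.getElementsByTag("img")) {
            images.put(img.attr("src"));
        }

        JSONObject json = new JSONObject();
        json.put("title", page.title());
        json.put("Links", links);
        json.put("Text in Links", linkTexts);
        json.put("Image files", images);
        return json;
    }

    public static void main(String[] args) throws Exception {
        // pretty print with an indent of 2 spaces
        System.out.println(scrape("http://www.google.com").toString(2));
    }
}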


There is an app consuming the above API called WebScraper under the Loklak apps page.



Now get wordpress blog updates with Loklak !

Loklak shall soon be spoiling its users !

Next, it will be bringing in tiny tweet-like cards showing the blog-posts (title, publishing date, author and content) from the given WordPress Blog URL.

This feature is certain to expand the realm of Loklak's mission of building a comprehensive and extensive social network dispensing useful information.


In order to implement this feature, I have again made use of JSoup, the Java HTML parser library, as it provides a very convenient API for extracting and manipulating data and for scraping and parsing HTML from a URL.

The information is scraped using JSoup after the corresponding URL, in the format "https://[username].wordpress.com/", is passed as an argument to the function scrapeWordpress(String blogURL){..}, which returns a JSONObject as the result.

A look at the code snippet :

/**
 *  WordPress Blog Scraper
 *  By Jigyasa Grover, @jig08
 **/

package org.loklak.harvester;

import java.io.IOException;

import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class WordPressBlogScraper {
	public static void main(String args[]){
		
		String blogURL = "https://loklaknet.wordpress.com/";
		scrapeWordpress(blogURL);		
	}
	
	public static JSONObject scrapeWordpress(String blogURL) {
		
                Document blogHTML = null;
		
		Elements articles = null;
		Elements articleList_title = null;
		Elements articleList_content = null;
		Elements articleList_dateTime = null;
		Elements articleList_author = null;

		String[][] blogPosts = new String[100][4];
		
		//blogPosts[][0] = Blog Title
		//blogPosts[][1] = Posted On
		//blogPosts[][2] = Author
		//blogPosts[][3] = Blog Content
		
		Integer numberOfBlogs = 0;
		Integer iterator = 0;
		
		try{			
			blogHTML = Jsoup.connect(blogURL).get();
		}catch (IOException e) {
            e.printStackTrace();
        }
			
			articles = blogHTML.getElementsByTag("article");
			
			iterator = 0;
			for(Element article : articles){
				
				articleList_title = article.getElementsByClass("entry-title");				
				for(Element blogs : articleList_title){
					blogPosts[iterator][0] = blogs.text().toString();
				}
				
				articleList_dateTime = article.getElementsByClass("posted-on");				
				for(Element blogs : articleList_dateTime){
					blogPosts[iterator][1] = blogs.text().toString();
				}
				
				articleList_author = article.getElementsByClass("byline");				
				for(Element blogs : articleList_author){
					blogPosts[iterator][2] = blogs.text().toString();
				}
				
				articleList_content = article.getElementsByClass("entry-content");				
				for(Element blogs : articleList_content){
					blogPosts[iterator][3] = blogs.text().toString();
				}
				
				iterator++;
				
			}
			
			numberOfBlogs = iterator;
			
			JSONArray blog = new JSONArray();
			
			for(int k = 0; k<numberOfBlogs; k++){
				JSONObject blogpost = new JSONObject();
				blogpost.put("blog_url", blogURL);
				blogpost.put("title", blogPosts[k][0]);
				blogpost.put("posted_on", blogPosts[k][1]);
				blogpost.put("author", blogPosts[k][2]);
				blogpost.put("content", blogPosts[k][3]);
				blog.put(blogpost);
			}			
			
			JSONObject final_blog_info = new JSONObject();
			
			final_blog_info.put("Wordpress blog: " + blogURL, blog);			

			System.out.println(final_blog_info);
			
			return final_blog_info;
		
	}
}

 

In this, an HTTP connection is simply established and the text is extracted using “element_name”.text() from inside the specific tags, identified using classes or ids. The tags from which the information was to be extracted were identified after exploring the web page's HTML source code.

The result thus obtained is in the form of a JSON Object

{
  "Wordpress blog: https://loklaknet.wordpress.com/": [
    {
      "posted_on": "June 19, 2016",
      "blog_url": "https://loklaknet.wordpress.com/",
      "author": "shivenmian",
      "title": "loklak_depot u2013 The Beginning: Accounts (Part 3)",
      "content": "So this is my third post in this five part series on loklak_depo... As always, feedback is duly welcome."
    },
    {
      "posted_on": "June 19, 2016",
      "blog_url": "https://loklaknet.wordpress.com/",
      "author": "sopankhosla",
      "title": "Creating a Loklak App!",
      "content": "Hello everyone! Today I will be shifting from course a...ore info refer to the full documentation here. Happy Coding!!!"
    },
    {
      "posted_on": "June 17, 2016",
      "blog_url": "https://loklaknet.wordpress.com/",
      "author": "leonmakk",
      "title": "Loklak Walls Manual Moderation u2013 tweet storage",
      "content": "Loklak walls are going to....Stay tuned for more updates on this new feature of loklak walls!"
    },
    {
      "posted_on": "June 17, 2016",
      "blog_url": "https://loklaknet.wordpress.com/",
      "author": "Robert",
      "title": "Under the hood: Authentication (login)",
      "content": "In the second post of .....key login is ready."
    },
    {
      "posted_on": "June 17, 2016",
      "blog_url": "https://loklaknet.wordpress.com/",
      "author": "jigyasa",
      "title": "Loklak gives some hackernews now !",
      "content": "It's been befittingly said  u... Also, Stay tuned for more posts on data crawling and parsing for Loklak. Feedback and Suggestions welcome"
    },
    {
      "posted_on": "June 16, 2016",
      "blog_url": "https://loklaknet.wordpress.com/",
      "author": "Damini",
      "title": "Does tweets have emotions?",
      "content": "Tweets do intend some kind o...t of features: classify(feat1,u2026,featN) = argmax(P(cat)*PROD(P(featI|cat)"
    },
    {
      "posted_on": "June 15, 2016",
      "blog_url": "https://loklaknet.wordpress.com/",
      "author": "sudheesh001",
      "title": "Dockerize the loklak server and publish docker images to IBM Containers on Bluemix Cloud",
      "content": "Docker is an open source...nd to create and deploy instantly as well as scale on demand."
    }
  ]
}

 

The next step now would include "writeToBackend"-ing and then parsing the JSONObject as desired.

Feel free to ask questions regarding the above code snippet, shall be happy to assist.

Feedback and Suggestions welcome 🙂


Loklak gives some hackernews now !

It’s been befittingly said  “Well, news is anything that’s interesting, that relates to what’s happening in the world, what’s happening in areas of the culture that would be of interest to your audience.” by Kurt Loder, the famous American Journalist.

And what better than Hackernews (news.ycombinator.com) for the tech community? It helps the community by showing the important and latest buzz, sorted by popularity, along with the links.


Loklak next tried to include this important piece of information in its server by collecting data from this source. Instead of the usual scraping of HTML pages that we had been doing for other sources, this time we read the RSS stream.

Simply put, RSS (Really Simple Syndication) uses a family of standard web feed formats to publish frequently updated information: blog entries, news headlines, audio, video. A standard XML file format ensures compatibility with many different machines/programs. RSS feeds also benefit users who want to receive timely updates from favorite websites or to aggregate data from many sites without signing-in and all.

Hackernews RSS Feed can be fetched via the URL https://news.ycombinator.com/rss and looks something like…

[Screenshot: the Hackernews RSS feed XML]

In order to keep things simple, I decided to use the ROME framework to make an RSS reader for Hackernews for Loklak.

Just for a quick introduction, ROME is a Java framework for RSS and Atom feeds. It’s open source and licensed under the Apache 2.0 license. ROME includes a set of parsers and generators for the various flavors of syndication feeds, as well as converters to convert from one format to another. The parsers can give you back Java objects that are either specific for the format you want to work with, or a generic normalized SyndFeed class that lets you work on with the data without bothering about the incoming or outgoing feed type.

So, I made a function hackernewsRSSReader which returns a JSONObject containing a JSONArray “Hackernews RSS Feed”, in which each JSONObject represents a ‘news headline’ from the source.

The structure of the JSONObject result obtained is something like:

{
   "Hackernews RSS Feed":[
      {
         "Description":"SyndContentImpl.value=....",
         "Updated-Date":"null",
         "Link":"http://journals.aps.org/prl/abstract/10.1103/PhysRevLett.116.241103",
         "RSS Feed":"https://news.ycombinator.com/rss",
         "Published-Date":"Wed Jun 15 13:30:33 EDT 2016",
         "Hash-Code":"1365366114",
         "Title":"Second Gravitational Wave Detected at LIGO",
         "URI":"http://journals.aps.org/prl/abstract/10.1103/PhysRevLett.116.241103"
      },
     ......
      {
         "Description":"SyndContentImpl.value=....",
         "Updated-Date":"null",
         "Link":"http://ocw.mit.edu/courses/aeronautics-and-astronautics/16-410-principles-of-autonomy-and-decision-making-fall-2010/lecture-notes/MIT16_410F10_lec20.pdf",
         "RSS Feed":"https://news.ycombinator.com/rss",
         "Published-Date":"Wed Jun 15 08:37:36 EDT 2016",
         "Hash-Code":"1649214835",
         "Title":"Intro to Hidden Markov Models (2010) [pdf]",
         "URI":"http://ocw.mit.edu/courses/aeronautics-and-astronautics/16-410-principles-of-autonomy-and-decision-making-fall-2010/lecture-notes/MIT16_410F10_lec20.pdf"
      }
   ]
}

It includes information like Title, Link, HashCode, Published Date, Updated Date, URI and the Description of each “news headline”.

The next step after extracting information is to write it to the back-end and then retrieve it whenever required and display it in the desired format as suitable to the Loklak Web Client after parsing it.

It requires JDOM and ROME jars to be configured into the build path before proceeding with implementation of the RSS Reader.

A look through the code for the HackernewsRSSReader.java :

/**
 *  Hacker News RSS Reader
 *  By Jigyasa Grover, @jig08
 **/

package org.loklak.harvester;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import org.json.JSONArray;
import org.json.JSONObject;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;

public class HackernewsRSSReader {	
	
	/*
	 * For HackernewsRSS, simply pass URL: https://news.ycombinator.com/rss 
	 * in the function to obtain a corresponding JSON
	 */
	@SuppressWarnings({ "unchecked", "static-access" })
	public static JSONObject hackernewsRSSReader(String url){
		 
	        URL feedUrl = null;
			try {
				feedUrl = new URL(url);
			} catch (MalformedURLException e) {
				e.printStackTrace();
			}
	        
	        SyndFeedInput input = new SyndFeedInput();
	        
	        SyndFeed feed = null;
			try {
				feed = input.build(new XmlReader(feedUrl));
			} catch (Exception e) {
				e.printStackTrace();
			}
	        
	        String[][] result = new String[100][7];
	        //result[][0] = Title
	        //result[][1] = Link
	        //result[][2] = URI
	        //result[][3] = Hash Code
	        //result[][4] = PublishedDate
	        //result[][5] = Updated Date
	        //result[][6] = Description
	        
	        @SuppressWarnings("unused")
			int totalEntries = 0;
	        int i = 0;
	        
	        JSONArray jsonArray = new JSONArray();
	        
	        for (SyndEntry entry : (List)feed.getEntries()) {
	        	
	        	result[i][0] = entry.getTitle().toString();
	        	result[i][1] = entry.getLink().toString();
	        	result[i][2] = entry.getUri().toString();
	        	result[i][3] = Integer.toString(entry.hashCode()); 
	        	result[i][4] = entry.getPublishedDate().toString();
	        	result[i][5] = ( (entry.getUpdatedDate() == null) ? ("null") : (entry.getUpdatedDate().toString()) );
	        	result[i][6] = entry.getDescription().toString();
	        	
		        JSONObject jsonObject = new JSONObject();

	        	jsonObject.put("RSS Feed", url);
	        	jsonObject.put("Title", result[i][0]);
	        	jsonObject.put("Link", result[i][1]);
	        	jsonObject.put("URI", result[i][2]);
	        	jsonObject.put("Hash-Code", result[i][3]);
	        	jsonObject.put("Published-Date", result[i][4]);
	        	jsonObject.put("Updated-Date", result[i][5]);
	        	jsonObject.put("Description", result[i][6]);
	        	
	        	jsonArray.put(i, jsonObject);
	        	
	        	i++;
	        }
	        
	        totalEntries = i;
	        
	    JSONObject rssFeed = new JSONObject();
	    rssFeed.put("Hackernews RSS Feed", jsonArray);
	    System.out.println(rssFeed);
		return rssFeed;
		
	}

}

 

Feel free to ask questions regarding the above code snippet.

Also, Stay tuned for more posts on data crawling and parsing for Loklak.

Feedback and Suggestions welcome 🙂
