Sunday, February 17, 2013

Getting Top Search Queries report with Google Webmaster Tools API

Google has a good, free tool that is very useful for search engine optimization analysis: Google Webmaster Tools. Its reports can be downloaded through the API. Here is an example of how to do that for the Top Queries report.

First you need to download gdata-webmastertools-2.0.jar and gdata-client-1.0.jar. You can take them from http://gdata-java-client.googlecode.com/files/gdata-src.java-1.47.1.zip (you'll find the jars in the lib folder of the archive).

Then you'll need to put them on your classpath. I will use Maven. First I will install these jars into the local repository:
mvn install:install-file -Dfile=gdata-webmastertools-2.0.jar -DgroupId=com.google.gdata -DartifactId=gdata-webmastertools -Dversion=2.0 -Dpackaging=jar -DgeneratePom=true
mvn install:install-file -Dfile=gdata-client-1.0.jar -DgroupId=com.google.gdata -DartifactId=gdata-client -Dversion=1.0 -Dpackaging=jar -DgeneratePom=true
After this I will add dependencies to my pom.xml:
<dependency>
    <groupId>com.google.gdata</groupId>
    <artifactId>gdata-client</artifactId>
    <version>1.0</version>
</dependency>
<dependency>
    <groupId>com.google.gdata</groupId>
    <artifactId>gdata-webmastertools</artifactId>
    <version>2.0</version>
</dependency>
We'll also need to parse JSON. I will use the Jackson library:
<dependency>
    <groupId>org.codehaus.jackson</groupId>
    <artifactId>jackson-mapper-asl</artifactId>
    <version>1.8.5</version>
</dependency>
And finally we are ready to get the report from the Google WMT API. (In this example I also used Apache Commons Lang and log4j, but they are not necessary, so you can get rid of those dependencies.)
package loader;

import com.google.gdata.client.Service.GDataRequest;
import com.google.gdata.client.Service.GDataRequest.RequestType;
import com.google.gdata.client.webmastertools.WebmasterToolsService;
import com.google.gdata.data.OutOfLineContent;
import com.google.gdata.data.webmastertools.SitesEntry;
import com.google.gdata.data.webmastertools.SitesFeed;
import com.google.gdata.util.*;
import org.apache.log4j.Logger;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;
import org.apache.commons.lang.time.DateUtils;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Map;

/**
 * Downloads the "Top Queries" report of a site from Google Webmaster Tools and
 * stores it as a CSV file in the current working directory.
 *
 * <p>The flow is: authenticate, look up the site in the account's sites feed
 * (temporarily adding it when it is missing), fetch the JSON list of download
 * links, download the {@code TOP_QUERIES} link once per configured search
 * property, and finally remove the site again if this run added it.
 *
 * <p>Credentials, the default site URL, the language code and the length of the
 * reporting period are read from a {@code Constants} class defined elsewhere.
 */
public class GoogleWMTClient {
    private static final Logger LOGGER = Logger.getLogger(GoogleWMTClient.class);

    /**
     * Values sent as the {@code prop} query parameter; one CSV file is written per entry.
     * Further values that can be enabled here: "WEB", "IMAGE", "VIDEO",
     * "MOBILE_SMARTPHONE", "MOBILE_RESTRICT".
     */
    private static final ArrayList<String> STATISTIC_TYPE = new ArrayList<String>();
    static {
        STATISTIC_TYPE.add("ALL");
    }
    private static final String GOOGLE_HOST = "www.google.com";
    private static final String DOWNLOAD_LIST_URL_PART = "/webmasters/tools/downloads-list?hl=%s&siteUrl=%s";
    private static final String SITES_FEED_URL_PART = "/webmasters/tools/feeds/sites/";
    /** Pattern used both for the {@code db}/{@code de} query parameters and for the file name. */
    private static final String DATE_FORMAT = "yyyyMMdd";
    private static final String CHARSET = "UTF-8";
    private static final String APPLICATION_NAME = "JavaDevTips";


    /**
     * Pulls the report for the site and language configured in {@code Constants},
     * using the current date as the end of the reporting period.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        pullData(Constants.GWMT_URL, Constants.GWMT_LANGUAGE_CODE, new Date());
    }

    /**
     * Downloads the Top Queries report(s) for one site.
     *
     * <p>Service and I/O failures are logged and not rethrown.
     *
     * @param url          site URL; normalised with {@link #correctSiteUrl(String)} before use
     * @param languageCode value of the {@code hl} parameter of the download-list request
     * @param endDate      last day of the reporting period; the first day is
     *                     {@code Constants.DATA_PERIOD} days earlier
     */
    public static void pullData(String url, String languageCode, Date endDate) {
        LOGGER.info("Download GWMT data for endDate: " + endDate + " and url: " + url);
        try {
            WebmasterToolsService service = initService(Constants.ADWORDS_USER, Constants.ADWORDS_PASSWORD);
            SitesEntry entry = findSitesEntry(service, url);
            // Remembers whether this run added the site, so it can be removed again afterwards.
            boolean newEntry = false;
            if (entry == null) {
                try {
                    entry = insertSiteEntry(service, url);
                } catch (ServiceForbiddenException ex) {
                    // Without an entry there is nothing to download; stop instead of
                    // dereferencing a null entry further down.
                    LOGGER.error(ex, ex);
                    return;
                }
                if (entry == null) {
                    LOGGER.error("SitesEntry for " + url + " could not be created");
                    return;
                }
                // Only flag the entry as ours once the insert really succeeded.
                newEntry = true;
            }
            try {
                downloadReports(service, entry, endDate, languageCode);
            } finally {
                // Clean up even when the download failed.
                if (newEntry) {
                    deleteSiteEntryQuietly(service, url);
                }
            }
        } catch (ServiceException e) {
            LOGGER.error(e, e);
        } catch (IOException e) {
            LOGGER.error(e, e);
        }
    }

    /**
     * Creates a Webmaster Tools service and sets the user credentials on it.
     *
     * @param userName account name
     * @param password account password
     * @return the configured service
     * @throws AuthenticationException if setting the credentials fails
     */
    public static WebmasterToolsService initService(String userName, String password) throws AuthenticationException {
        WebmasterToolsService service = new WebmasterToolsService(APPLICATION_NAME);
        service.setUserCredentials(userName, password);
        return service;
    }

    /**
     * Searches the sites feed for an entry whose title equals the normalised site URL.
     *
     * @return the matching entry, or {@code null} when the feed contains none
     */
    private static SitesEntry findSitesEntry(WebmasterToolsService service, String siteUrl) throws IOException,
            ServiceException {
        siteUrl = correctSiteUrl(siteUrl);
        LOGGER.info("Trying to find SitesEntry for " + siteUrl);
        SitesFeed sitesResultFeed = service.getFeed(getGoogleUrl(SITES_FEED_URL_PART), SitesFeed.class);
        for (SitesEntry entry : sitesResultFeed.getEntries()) {
            if (entry.getTitle().getPlainText().equals(siteUrl)) {
                LOGGER.info("SitesEntry is found");
                return entry;
            }
        }
        LOGGER.info("SitesEntry for " + siteUrl + " not found");
        return null;
    }

    /** Builds an HTTPS URL on {@link #GOOGLE_HOST} from a path (which may include a query string). */
    private static URL getGoogleUrl(String path) throws MalformedURLException {
        return new URL("https://" + GOOGLE_HOST + path);
    }

    /**
     * Normalises a site URL: trims it, appends a trailing slash when missing and
     * prepends {@code http://} when no http/https scheme is present.
     */
    private static String correctSiteUrl(String siteUrl) {
        siteUrl = siteUrl.trim();
        if (!siteUrl.endsWith("/")) {
            siteUrl += "/";
        }
        // Test for the full scheme prefix: a bare "http" check would also match
        // scheme-less hosts whose name merely starts with those letters.
        if (!siteUrl.startsWith("http://") && !siteUrl.startsWith("https://")) {
            siteUrl = "http://" + siteUrl;
        }
        return siteUrl;
    }

    /**
     * Fetches the download list for the entry and writes one
     * {@code gwmt_<endDate><prop>.csv} file per configured search property.
     * A file that received no data is deleted again.
     */
    private static void downloadReports(WebmasterToolsService service, SitesEntry entry, Date endDate,
            String languageCode) throws IOException, ServiceException {
        LOGGER.info("Downloading reports for " + entry.getTitle().getPlainText());
        Date startDate = DateUtils.addDays(endDate, (-1) * Constants.DATA_PERIOD);
        InputStream inputStream = getQueryInputStream(service, entry, languageCode);
        if (inputStream == null) {
            LOGGER.error("Empty InputStream");
            return;
        }
        Map<String, Object> map;
        try {
            map = new ObjectMapper().readValue(inputStream, new TypeReference<Map<String, Object>>() {
            });
        } finally {
            closeQuietly(inputStream);
        }
        if (map == null) {
            return;
        }
        Object topQueriesPath = map.get("TOP_QUERIES");
        if (!(topQueriesPath instanceof String)) {
            LOGGER.error("Download list contains no TOP_QUERIES link");
            return;
        }

        // The period is the same for every property, so format it once.
        SimpleDateFormat sdf = new SimpleDateFormat(DATE_FORMAT);
        String from = sdf.format(startDate);
        String to = sdf.format(endDate);
        for (String prop : STATISTIC_TYPE) {
            String path = topQueriesPath + "&prop=" + prop + "&db=" + from + "&de=" + to;
            String fileName = "gwmt_" + to + prop + ".csv";
            File file = new File(fileName);

            boolean hasData;
            OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(file), CHARSET);
            try {
                hasData = downloadData(service, path, out);
            } finally {
                out.close();
            }
            // Delete only after the writer is closed; an open handle can block deletion.
            if (!hasData) {
                LOGGER.info("File contains no data. Deleting.");
                if (!file.delete()) {
                    LOGGER.warn("Could not delete empty file " + fileName);
                }
            }
        }
    }

    /**
     * Requests {@code path} as plain text and copies every line of the response to
     * {@code out}, each terminated by {@code "\n"}. The writer is left open for the caller.
     *
     * @return {@code true} if the response contained at least one line
     */
    private static boolean downloadData(WebmasterToolsService service, String path, OutputStreamWriter out)
            throws IOException, ServiceException {
        LOGGER.info("Downloading data for " + path);
        URL url = getGoogleUrl(path);
        GDataRequest req = service.createRequest(RequestType.QUERY, url, ContentType.TEXT_PLAIN);
        req.execute();
        // NOTE(review): assumes the response body is UTF-8, matching the encoding the
        // file is written in - confirm against the service's Content-Type header.
        BufferedReader in = new BufferedReader(new InputStreamReader(req.getResponseStream(), CHARSET));
        try {
            boolean hasData = false;
            String line;
            // Every line is copied, including the first one.
            while ((line = in.readLine()) != null) {
                out.write(line);
                out.write("\n");
                hasData = true;
            }
            return hasData;
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Requests the JSON list of download links for the entry's site.
     *
     * @return the response stream (the caller must close it), or {@code null}
     *         when the service answered with a redirect
     */
    private static InputStream getQueryInputStream(WebmasterToolsService service, SitesEntry entry, String lang)
            throws IOException, ServiceException {
        // The site URL is itself a URL, so it has to be escaped to be a valid query value.
        String siteUrl = URLEncoder.encode(entry.getTitle().getPlainText(), CHARSET);
        URL url = getGoogleUrl(String.format(DOWNLOAD_LIST_URL_PART, lang, siteUrl));
        GDataRequest req = service.createRequest(RequestType.QUERY, url, ContentType.JSON);
        try {
            req.execute();
            return req.getResponseStream();
        } catch (RedirectRequiredException e) {
            LOGGER.error(e, e);
        }
        return null;
    }

    /**
     * Adds the (normalised) site URL to the sites feed.
     *
     * @return the entry returned by the insert call
     */
    private static SitesEntry insertSiteEntry(WebmasterToolsService myService, String siteUrl) throws IOException,
            ServiceException {
        siteUrl = correctSiteUrl(siteUrl);
        SitesEntry entry = new SitesEntry();
        OutOfLineContent content = new OutOfLineContent();
        content.setUri(siteUrl);
        entry.setContent(content);
        LOGGER.info("Adding SitesEntry for  " + siteUrl);
        return myService.insert(getGoogleUrl(SITES_FEED_URL_PART), entry);
    }

    /** Fetches the entry of the (normalised) site URL from the sites feed and deletes it. */
    private static void deleteSiteEntry(WebmasterToolsService myService, String siteUrl) throws IOException,
            ServiceException {
        siteUrl = correctSiteUrl(siteUrl);
        String siteId = URLEncoder.encode(siteUrl, CHARSET);
        URL feedUrl = getGoogleUrl(SITES_FEED_URL_PART + siteId);
        SitesEntry entry = myService.getEntry(feedUrl, SitesEntry.class);
        LOGGER.info("Deleting SitesEntry for " + siteUrl);
        entry.delete();
    }

    /**
     * Deletes the site entry and logs any failure instead of throwing, so that it
     * can run in a {@code finally} block without hiding the original exception.
     */
    private static void deleteSiteEntryQuietly(WebmasterToolsService myService, String siteUrl) {
        try {
            deleteSiteEntry(myService, siteUrl);
        } catch (ServiceException e) {
            LOGGER.error(e, e);
        } catch (IOException e) {
            LOGGER.error(e, e);
        }
    }

    /** Closes a resource and logs, rather than propagates, a failure to do so. */
    private static void closeQuietly(Closeable resource) {
        try {
            resource.close();
        } catch (IOException e) {
            LOGGER.warn(e, e);
        }
    }
}