\documentclass[11pt,twoside]{article}\makeatletter

\IfFileExists{xcolor.sty}%
  {\RequirePackage{xcolor}}%
  {\RequirePackage{color}}
\usepackage{colortbl}
\usepackage{wrapfig}
\usepackage{ifxetex}
\ifxetex
  % XeTeX branch: fonts are selected through fontspec.
  \usepackage{fontspec}
  \usepackage{xunicode}
  % U+20E5, U+2774 and U+2775 are made active so that they print as a
  % literal backslash and literal braces respectively.
  \catcode`⃥=\active \def⃥{\textbackslash}
  \catcode`❴=\active \def❴{\{}
  \catcode`❵=\active \def❵{\}}
  % Script switches: each selects a Noto CJK face for the text that follows.
  \def\textJapanese{\fontspec{Noto Sans CJK JP}}
  \def\textChinese{\fontspec{Noto Sans CJK SC}}
  \def\textKorean{\fontspec{Noto Sans CJK KR}}
  \setmonofont{DejaVu Sans Mono}
\else
  % Non-XeTeX branch: prefer the utf8x input encoding when utf8x.def is
  % installed, otherwise fall back to plain utf8.
  \IfFileExists{utf8x.def}%
   {\usepackage[utf8x]{inputenc}
      \PrerenderUnicode{–}
    }%
   {\usepackage[utf8]{inputenc}}
  \usepackage[english]{babel}
  \usepackage[T1]{fontenc}
  \usepackage{float}
  \usepackage[]{ucs}
  % Per-code-point output definitions (decimal code points). The first
  % three mirror the active characters set up in the XeTeX branch.
  \uc@dclc{8421}{default}{\textbackslash }
  \uc@dclc{10100}{default}{\{}
  \uc@dclc{10101}{default}{\}}
  \uc@dclc{8491}{default}{\AA{}}
  \uc@dclc{8239}{default}{\,}
  \uc@dclc{20154}{default}{ }
  \uc@dclc{10148}{default}{>}
  % Schwa is faked with a rotated "e".
  \def\textschwa{\rotatebox{-90}{e}}
  % The script switches do nothing here. \textKorean is included so that
  % all three switches defined in the XeTeX branch exist in both branches;
  % without it, any use of \textKorean fails with "Undefined control
  % sequence" on this path.
  \def\textJapanese{}
  \def\textChinese{}
  \def\textKorean{}
  \IfFileExists{tipa.sty}{\usepackage{tipa}}{}
\fi
% Font switch for example material: typewriter family at \small size.
\def\exampleFont{\ttfamily\small}
% \textpi is declared as slot 25 of the OML encoding.
\DeclareTextSymbol{\textpi}{OML}{25}
\usepackage{relsize}
\RequirePackage{array}
% \@testpach: replacement for the array package's internal macro of the
% same name (array is loaded just above). It sets \@chclass and \@chnum
% from the previous class (\@lastchclass) and the current preamble token
% (\@nextchar):
%   c / l / r            -> class 0, number 0 / 1 / 2
%   |                    -> class 1
%   ! @ ( )              -> classes 6, 7, 8, 9
%   m / p / b            -> class 10, number 3 / 4 / 5
%   anything else        -> \@preamerr
% In this version "(" and ")" are the tokens that select classes 8 and 9;
% the column types and \Panel defined elsewhere in this file accordingly
% write "){...}" in front of a column specifier.
% NOTE(review): presumably "(" / ")" stand in for array's usual "<" / ">"
% here -- confirm against the array.sty version in use.
\def\@testpach{\@chclass
 \ifnum \@lastchclass=6 \@ne \@chnum \@ne \else
  \ifnum \@lastchclass=7 5 \else
   \ifnum \@lastchclass=8 \tw@ \else
    \ifnum \@lastchclass=9 \thr@@
   \else \z@
   \ifnum \@lastchclass = 10 \else
   \edef\@nextchar{\expandafter\string\@nextchar}%
   \@chnum
   \if \@nextchar c\z@ \else
    \if \@nextchar l\@ne \else
     \if \@nextchar r\tw@ \else
   \z@ \@chclass
   \if\@nextchar |\@ne \else
    \if \@nextchar !6 \else
     \if \@nextchar @7 \else
      \if \@nextchar (8 \else
       \if \@nextchar )9 \else
  10
  \@chnum
  \if \@nextchar m\thr@@\else
   \if \@nextchar p4 \else
    \if \@nextchar b5 \else
   \z@ \@chclass \z@ \@preamerr \z@ \fi \fi \fi \fi
   \fi \fi  \fi  \fi  \fi  \fi  \fi \fi \fi \fi \fi \fi}
% \arraybackslash restores \\ to the array/tabular row terminator; the
% column types in this file use it after \raggedright, \centering, etc.
\gdef\arraybackslash{\let\\=\@arraycr}
% Text-mode subscript: the argument is boxed at script size (\sf@size)
% and lowered with a math subscript.
\def\@textsubscript#1{{\m@th\ensuremath{_{\mbox{\fontsize\sf@size\z@#1}}}}}
% \Panel{content}{colour}{span}{colspec}: a \multicolumn cell spanning
% #3 columns, with background colour #2 and column specifier #4.
\def\Panel#1#2#3#4{\multicolumn{#3}{){\columncolor{#2}}#4}{#1}}
% Phrase-level element names defined with empty bodies: they expand to
% nothing and leave any following text untouched.
\def\abbr{}
\def\corr{}
\def\expan{}
\def\gap{}
\def\orig{}
\def\reg{}
% NOTE(review): this also empties LaTeX's \ref at this point of the
% preamble -- confirm that a later package restores it if \ref is needed.
\def\ref{}
\def\sic{}
\def\persName{}\def\name{}
\def\placeName{}
\def\orgName{}
% \textcal and \textgothic call \fontspec, which this file loads only in
% the XeTeX branch of the engine test.
\def\textcal#1{{\fontspec{Lucida Calligraphy}#1}}
\def\textgothic#1{{\fontspec{Lucida Blackletter}#1}}
\def\textlarge#1{{\large #1}}
\def\textoverbar#1{\ensuremath{\overline{#1}}}
% Wraps the argument in typographic single quotation marks.
\def\textquoted#1{‘#1’}
\def\textsmall#1{{\small #1}}
\def\textsubscript#1{\@textsubscript{\selectfont#1}}
\def\textxi{\ensuremath{\xi}}
% Title styling: a bare switch to italic (no grouping of its own).
\def\titlem{\itshape}
% Block-level environments used by the generated body.

% Environments that add nothing at either end.
\newenvironment{bibl}{}{}
\newenvironment{rubric}{}{}

% Environments that only finish the paragraph at their end; the
% \ifvmode-guarded ones issue \par only when TeX is in vertical mode.
\newenvironment{biblfree}%
  {}%
  {\ifvmode\par\fi}
\newenvironment{citbibl}%
  {}%
  {\ifvmode\par\fi}
\newenvironment{docDate}%
  {}%
  {\ifvmode\par\fi}
\newenvironment{titlePart}%
  {}%
  {\par}

% Environments that open with 6pt of vertical space.
\newenvironment{docImprint}%
  {\vskip6pt}%
  {\ifvmode\par\fi}
\newenvironment{msHead}%
  {\vskip6pt}%
  {\par}
\newenvironment{msItem}%
  {\vskip6pt}%
  {\par}

% Title-page elements: 22pt bold title, 16pt italic byline and author.
\newenvironment{docTitle}%
  {\vskip6pt\bfseries\fontsize{22pt}{25pt}\selectfont}%
  {\par}
\newenvironment{byline}%
  {\vskip6pt\itshape\fontsize{16pt}{18pt}\selectfont}%
  {\par}
\newenvironment{docAuthor}%
  {\ifvmode
     \vskip4pt\fontsize{16pt}{18pt}\selectfont
   \fi
   \itshape}%
  {\ifvmode\par\fi}

% One-argument column types; the argument is the column width.
% L, C, R: p-columns set ragged-right, centred and ragged-left.
% P, B, M: p-, b- and m-columns with only \arraybackslash applied.
% The leading "){...}" attaches the declarations to the column; ")" is
% one of the preamble tokens recognised by this file's \@testpach.
\newcolumntype{L}[1]{){\raggedright\arraybackslash}p{#1}}
\newcolumntype{C}[1]{){\centering\arraybackslash}p{#1}}
\newcolumntype{R}[1]{){\raggedleft\arraybackslash}p{#1}}
\newcolumntype{P}[1]{){\arraybackslash}p{#1}}
\newcolumntype{B}[1]{){\arraybackslash}b{#1}}
\newcolumntype{M}[1]{){\arraybackslash}m{#1}}
% "label" is a light grey (gray level 0.75).
\definecolor{label}{gray}{0.75}
% \unusedattribute{text}: the text in the "label" grey, struck out.
\def\unusedattribute#1{\sout{\textcolor{label}{#1}}}
% \xref{A}{B}: after \hyper@normalise, the two arguments are handed to
% \hyper@linkurl in swapped order, i.e. \hyper@linkurl{B}{A}.
\DeclareRobustCommand*{\xref}{\hyper@normalise\xref@}
\def\xref@#1#2{\hyper@linkurl{#2}{#1}}
% Underscore handling. Inside the group the underscore is active and is
% globally defined to set its argument as an upright math subscript.
% Afterwards its math code is set to "8000 and its category code to 12.
\begingroup
\catcode`\_=\active
\gdef_#1{\ensuremath{\sb{\mathrm{#1}}}}
\endgroup
\mathcode`\_=\string"8000
\catcode`\_=12\relax

\usepackage[a4paper,twoside,lmargin=1in,rmargin=1in,tmargin=1in,bmargin=1in,marginparwidth=0.75in]{geometry}
\usepackage{framed}

\definecolor{shadecolor}{gray}{0.95}
\usepackage{longtable}
\usepackage[normalem]{ulem}
\usepackage{fancyvrb}
\usepackage{fancyhdr}
\usepackage{graphicx}
\usepackage{marginnote}

% Citation formatting: print the citation without the kernel's square
% brackets. The LaTeX kernel calls \@cite with two arguments (the
% formatted citation and the optional note of \cite[note]{key}). Both
% are accepted here, so the note is consumed and printed after a comma
% (when one was given) instead of being left in the input stream.
\renewcommand{\@cite}[2]{#1\if@tempswa , #2\fi}

% Margin notes are set in italic at \footnotesize.
\renewcommand*{\marginfont}{\itshape\footnotesize}

% Graphics file extensions, listed in this order.
\def\Gin@extensions{.pdf,.png,.jpg,.mps,.tif}

\pagestyle{fancy}

% PDF metadata and hyperlinks. The title previously read "Ap Spark
% Perspective", a truncation of "Apache Spark Perspective" (the article
% is about Apache Spark throughout).
\usepackage[pdftitle={Big Data Analysis: Apache Spark Perspective},
 pdfauthor={}]{hyperref}
\hyperbaseurl{}

% A4 paper dimensions.
\paperwidth210mm
\paperheight297mm

% Table-of-contents geometry: width of the page-number box, right
% margin of entries, and spacing of the leader dots.
\def\@pnumwidth{1.55em}
\def\@tocrmarg {2.55em}
\def\@dotsep{4.5}
\setcounter{tocdepth}{3}

% Line- and page-breaking parameters.
\clubpenalty=8000
\emergencystretch 3em
\hbadness=4000
\hyphenpenalty=400
\pretolerance=750
\tolerance=2000
\vbadness=4000
\widowpenalty=10000

% Sectioning commands, all built on \@startsection with the arguments
% {name}{level}{indent}{space before}{space after}{style}.
% \section is \Large bold, \subsection \Large, \subsubsection \large and
% \paragraph \normalsize. \subparagraph is indented by \parindent and is
% the only one given a negative fifth argument (-1em).
\renewcommand\section{\@startsection {section}{1}{\z@}%
     {-1.75ex \@plus -0.5ex \@minus -.2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\Large\bfseries}}
\renewcommand\subsection{\@startsection{subsection}{2}{\z@}%
     {-1.75ex\@plus -0.5ex \@minus- .2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\Large}}
\renewcommand\subsubsection{\@startsection{subsubsection}{3}{\z@}%
     {-1.5ex\@plus -0.35ex \@minus -.2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\large}}
\renewcommand\paragraph{\@startsection{paragraph}{4}{\z@}%
     {-1ex \@plus-0.35ex \@minus -0.2ex}%
     {0.5ex \@plus .2ex}%
     {\reset@font\normalsize}}
\renewcommand\subparagraph{\@startsection{subparagraph}{5}{\parindent}%
     {1.5ex \@plus1ex \@minus .2ex}%
     {-1em}%
     {\reset@font\normalsize\bfseries}}


% Table-of-contents entries.
% \l@section{title}{page}: a bold entry preceded by 1.0em (plus 1pt) of
% vertical space, with the page number right-aligned in a box of width
% \@pnumwidth and no leader dots.
\def\l@section#1#2{\addpenalty{\@secpenalty} \addvspace{1.0em plus 1pt}
 \@tempdima 1.5em \begingroup
 \parindent \z@ \rightskip \@pnumwidth 
 \parfillskip -\@pnumwidth 
 \bfseries \leavevmode #1\hfil \hbox to\@pnumwidth{\hss #2}\par
 \endgroup}
% Lower levels use \@dottedtocline{level}{indent}{number width}.
\def\l@subsection{\@dottedtocline{2}{1.5em}{2.3em}}
\def\l@subsubsection{\@dottedtocline{3}{3.8em}{3.2em}}
\def\l@paragraph{\@dottedtocline{4}{7.0em}{4.1em}}
\def\l@subparagraph{\@dottedtocline{5}{10em}{5em}}
% Book-style front/main/back matter switches. The section and chapter
% counters are created only if they do not already exist.
\@ifundefined{c@section}{\newcounter{section}}{}
\@ifundefined{c@chapter}{\newcounter{chapter}}{}
% \if@mainmatter is declared and set to true here; none of the three
% switches below changes it.
\newif\if@mainmatter 
\@mainmattertrue
\def\chaptername{Chapter}
% \frontmatter: roman page numbers and roman chapter/section numbers,
% with an empty chapter prefix (\@chapapp).
\def\frontmatter{%
  \pagenumbering{roman}
  \def\thechapter{\@roman\c@chapter}
  \def\theHchapter{\roman{chapter}}
  \def\thesection{\@roman\c@section}
  \def\theHsection{\roman{section}}
  \def\@chapapp{}%
}
% \mainmatter: starts on a fresh page (\cleardoublepage), resets the
% chapter and section counters, switches pages, chapters and sections to
% arabic numbering and sets secnumdepth to 6.
\def\mainmatter{%
  \cleardoublepage
  \def\thechapter{\@arabic\c@chapter}
  \setcounter{chapter}{0}
  \setcounter{section}{0}
  \pagenumbering{arabic}
  \setcounter{secnumdepth}{6}
  \def\@chapapp{\chaptername}%
  \def\theHchapter{\arabic{chapter}}
  \def\thesection{\@arabic\c@section}
  \def\theHsection{\arabic{section}}
}
% \backmatter: starts on a fresh page, resets the chapter and section
% counters, sets secnumdepth to 2, uses \appendixname as chapter prefix
% with upper-case letters as chapter numbers, then calls \appendix.
\def\backmatter{%
  \cleardoublepage
  \setcounter{chapter}{0}
  \setcounter{section}{0}
  \setcounter{secnumdepth}{2}
  \def\@chapapp{\appendixname}%
  \def\thechapter{\@Alph\c@chapter}
  \def\theHchapter{\Alph{chapter}}
  \appendix
}
% bibitemlist{widest label}: a numbered reference list with no heading
% of its own. Labels are \@biblabel around the arabic value of enumiv;
% the label width is measured from the argument. Inside the list the
% text is set \sloppy, club and widow penalties are 4000, and the space
% factor of "." is set to \@m. An empty list only triggers a warning
% (\@noitemerr is redefined at the end of the environment).
\newenvironment{bibitemlist}[1]{%
   \list{\@biblabel{\@arabic\c@enumiv}}%
       {\settowidth\labelwidth{\@biblabel{#1}}%
        \leftmargin\labelwidth
        \advance\leftmargin\labelsep
        \@openbib@code
        \usecounter{enumiv}%
        \let\p@enumiv\@empty
        \renewcommand\theenumiv{\@arabic\c@enumiv}%
	}%
  \sloppy
  \clubpenalty4000
  \@clubpenalty \clubpenalty
  \widowpenalty4000%
  \sfcode`\.\@m}%
  {\def\@noitemerr
    {\@latex@warning{Empty `bibitemlist' environment}}%
    \endlist}

% Table of contents: an unnumbered section heading followed by the
% contents of the .toc file.
\def\tableofcontents{\section*{\contentsname}\@starttoc{toc}}
% Paragraphs: no vertical space between them, 1em first-line indent.
\parskip0pt
\parindent1em
% Same definition of \Panel as the one given earlier in this preamble.
\def\Panel#1#2#3#4{\multicolumn{#3}{){\columncolor{#2}}#4}{#1}}
% reflist: a ragged-right list with an empty default label, no space
% above the list or between items, 2pt between paragraphs of an item and
% a 0.25in right margin. Item labels are set in italic.
\newenvironment{reflist}%
  {\begin{raggedright}%
   \begin{list}{}{%
     \setlength{\topsep}{0pt}%
     \setlength{\itemsep}{0pt}%
     \setlength{\parsep}{2pt}%
     \setlength{\parskip}{0pt}%
     \setlength{\itemindent}{0pt}%
     \setlength{\rightmargin}{0.25in}%
     \def\makelabel##1{\itshape ##1}%
   }%
  }%
  {\end{list}\end{raggedright}}
% sansreflist: identical to reflist except that item labels are upright.
\newenvironment{sansreflist}%
  {\begin{raggedright}%
   \begin{list}{}{%
     \setlength{\topsep}{0pt}%
     \setlength{\itemsep}{0pt}%
     \setlength{\parsep}{2pt}%
     \setlength{\parskip}{0pt}%
     \setlength{\itemindent}{0pt}%
     \setlength{\rightmargin}{0.25in}%
     \def\makelabel##1{\upshape ##1}%
   }%
  }%
  {\end{list}\end{raggedright}}
% specHead{label}{title}: a heading block. It adds vertical space and a
% horizontal rule, then a hyperlink anchor labelled #1, a right mark and
% a level-2 PDF bookmark for #2, and finally prints #2 in 16pt bold,
% shifted 0.75in to the left. The empty line inside the definition is a
% paragraph break and is part of the code. Nothing is done at the end of
% the environment.
\newenvironment{specHead}[2]%
 {\vspace{20pt}\hrule\vspace{10pt}%
  \phantomsection\label{#1}\markright{#2}%

  \pdfbookmark[2]{#2}{#1}%
  \hspace{-0.75in}{\bfseries\fontsize{16pt}{18pt}\selectfont#2}%
  }{}
      % Publication metadata used by the title block and the page footers.
      \def\TheFullDate{2015-01-15 (revised: 15 January 2015)}
% \TheID is printed in the page footers; its body holds no identifier
% text, only \makeatother.
\def\TheID{\makeatother }
\def\TheDate{2015-01-15}
% Title restored from the truncated "Ap Spark Perspective"; the article
% is about Apache Spark throughout.
\title{Big Data Analysis: Apache Spark Perspective}
% \author is left empty here; the authors are supplied in the document
% body with optional-argument \author and \affil commands.
\author{}\makeatletter 
\makeatletter
% \cleartoleftpage: finish the current page with \clearpage. In a
% two-sided layout, if the page counter is then odd, an empty page is
% emitted so that what follows starts on an even-numbered page. In
% two-column mode a second empty box and \newpage are emitted as well.
\newcommand*{\cleartoleftpage}{%
  \clearpage
  \if@twoside\ifodd\c@page
    \hbox{}\newpage
    \if@twocolumn\hbox{}\newpage\fi
  \fi\fi
}
\makeatother
\makeatletter
% Page style for the current page, and running-head marks taken from
% the title and author.
\thispagestyle{empty}
\markright{\@title}\markboth{\@title}{\@author}
% \small is redefined as 9pt type on an 11pt baseline, together with the
% display-math skips and the first-level list spacing (\@listi) that
% apply at that size.
\renewcommand\small{\@setfontsize\small{9pt}{11pt}\abovedisplayskip 8.5\p@ plus3\p@ minus4\p@
\belowdisplayskip \abovedisplayskip
\abovedisplayshortskip \z@ plus2\p@
\belowdisplayshortskip 4\p@ plus2\p@ minus2\p@
\def\@listi{\leftmargin\leftmargini
               \topsep 2\p@ plus1\p@ minus1\p@
               \parsep 2\p@ plus\p@ minus\p@
               \itemsep 1pt}
}
\makeatother
% fancyvrb defaults: framed verbatim blocks with 5mm side margins; blank
% lines are not numbered.
\fvset{frame=single,numberblanklines=false,xleftmargin=5mm,xrightmargin=5mm}
% Headers and footers (fancyhdr). All fields are cleared first. Even
% pages carry \leftmark at the left of the header and odd pages
% \rightmark at the right, both bold. The page number is centred in the
% footer; \TheID goes in the left footer of odd pages and the right
% footer of even pages.
\fancyhf{} 
\setlength{\headheight}{14pt}
\fancyhead[LE]{\bfseries\leftmark} 
\fancyhead[RO]{\bfseries\rightmark} 
\fancyfoot[RO]{}
\fancyfoot[CO]{\thepage}
\fancyfoot[LO]{\TheID}
\fancyfoot[LE]{}
\fancyfoot[CE]{\thepage}
\fancyfoot[RE]{\TheID}
% Link borders in light grey (0.75 on all three channels); section
% numbers are included in PDF bookmarks.
\hypersetup{citebordercolor=0.75 0.75 0.75,linkbordercolor=0.75 0.75 0.75,urlbordercolor=0.75 0.75 0.75,bookmarksnumbered=true}
% The "plain" page style has an empty header and no header rule.
\fancypagestyle{plain}{\fancyhead{}\renewcommand{\headrulewidth}{0pt}}

\date{}
\usepackage{authblk}

% \keywords{<list>}: typesets "Index terms---" in bold italic followed
% by the list, as a paragraph of its own at \footnotesize.
% The size switch is kept inside a group. \footnotesize is a
% declaration, and without the group it stayed in force for everything
% typeset after \keywords. The \par sits inside the group so that the
% paragraph is broken with the \footnotesize line spacing.
\providecommand{\keywords}[1]
{%
  \begingroup
  \footnotesize
  \textbf{\textit{Index terms---}} #1\par
  \endgroup
}

\usepackage{graphicx,xcolor}
% Journal colour palette, given as HTML hex values.
\definecolor{GJBlue}{HTML}{273B81}
\definecolor{GJLightBlue}{HTML}{0A9DD9}
\definecolor{GJMediumGrey}{HTML}{6D6E70}
\definecolor{GJLightGrey}{HTML}{929497} 

% abstract: ragged-right text without paragraph indent, placed between
% two 2pt grey rules of width \textwidth. The opening rule is followed
% by 16pt of space and the abstract heading (\abstractname) in large
% bold blue; the closing rule is preceded by 8pt and followed by 16pt.
\renewenvironment{abstract}{%
   \setlength{\parindent}{0pt}\raggedright
   \textcolor{GJMediumGrey}{\rule{\textwidth}{2pt}}
   \vskip16pt
   \textcolor{GJBlue}{\large\bfseries\abstractname\space}
}{%   
   \vskip8pt
   \textcolor{GJMediumGrey}{\rule{\textwidth}{2pt}}
   \vskip16pt
}

\usepackage[absolute,overlay]{textpos}

\makeatother 
      \usepackage{lineno}
      \linenumbers
      
\begin{document}

             \author[1]{Abdul Ghaffar  Shoro}

             \author[2]{Tariq Rahim  Soomro}

             \affil[1]{  SZABIST Dubai Campus}

\renewcommand\Authands{ and }

\date{\small \em Received: 7 December 2014 Accepted: 5 January 2015 Published: 15 January 2015}

\maketitle


\begin{abstract}
        


Big Data have gained enormous attention in recent years. Analyzing big data is very common requirement today and such requirements become nightmare when analyzing of bulk data source such as twitter twits are done, it is really a big challenge to analyze the bulk amount of twits to get relevance and different patterns of information on timely manner.  This paper will explore the concept of Big Data Analysis and recognize some meaningful information from some sample big data source, such as Twitter twits, using one of industries emerging tool, known as Spark by Apache.

\end{abstract}


\keywords{big data analysis, twitter, apache spark, apache hadoop, open source.}

\begin{textblock*}{18cm}(1cm,1cm) % {block width} (coords) 
\textcolor{GJBlue}{\LARGE Global Journals \LaTeX\ JournalKaleidoscope\texttrademark}
\end{textblock*}

\begin{textblock*}{18cm}(1.4cm,1.5cm) % {block width} (coords) 
\textcolor{GJBlue}{\footnotesize \\ Artificial Intelligence formulated this projection for compatibility purposes from the original article published at Global Journals. However, this technology is currently in beta. \emph{Therefore, kindly ignore odd layouts, missed formulae, text, tables, or figures.}}
\end{textblock*}


\let\tabcellsep& 	 	 		 \par
\section[{I. Introduction}]{I. Introduction}\par
In today's computer age, our life has become pretty much dependent on technological gadgets and more or less all aspects of human life, such as personal, social and professional, are fully covered with technology. More or less all the above aspects are dealing with some sort of data; the immense increase in complexity of data, due to rapid growth in required speed and variety, has originated new challenges in the life of data management. This is where the term Big Data was born. Accessing, analyzing, securing and storing big data are among the most spoken terms in today's technological world. Big Data analysis is a process of gathering data from different resources, then organizing that data in a meaningful way, and then analyzing those big sets of data to discover meaningful facts and figures from that data collection. This analysis of data not only helps to determine the hidden facts and figures of information in bulk of big data, but also helps to categorize the data or rank the data with respect to the importance of the information it provides. In short, big data analysis is the process of finding knowledge from bulk variety of data. Twitter as an organization itself processes approximately 10k tweets per second before publishing them for public; they analyze all this data at this extremely fast rate, to ensure every tweet is following decency policy and restricted words are filtered out from tweets. All this analyzing process must be done in real time to avoid delays in publishing twits live for public; for example, businesses like Forex Trading analyze social data to predict future public trends. To analyze such huge data it is required to use some kind of analysis tool. This paper focuses on the open source tool Apache Spark. Spark is a cluster computing system from Apache with incubator status; this tool is specialized at making data analysis faster, and it is pretty fast at both running programs as well as writing data.\footnote{Authors: Department of Computing, SZABIST Dubai Campus, Dubai, UAE. E-mails: shoroghaffar@gmail.com, tariq@szabist.ac.ae} Spark supports in-memory computing, which enables it to query data much faster compared to disk-based engines such as Hadoop, and it also offers a general execution model that can optimize arbitrary operator graphs \hyperref[b0]{[1]}. This paper is organized as follows: section 2 focuses on literature review exploring Big Data Analysis \& its tools and recognizing some meaningful information from some sample big data source, such as Twitter feeds, using one of the industry's emerging tools, Apache Spark, along with justification of using Spark; section 3 will discuss material and method; section 4 will discuss the results of analyzing big data using Spark; and finally discussion and future work will be highlighted in section 5. 
\section[{II.}]{II.} 
\section[{Literature Review a) Big Data}]{Literature Review a) Big Data}\par
A very popular description for the exponential growth and availability of huge amount of data with all possible variety is popularly termed as Big Data. This is one of the most spoke about terms in today's automated world and perhaps big data is becoming of equal importance to business and society as the Internet has been. It is widely believed and proved that more data leads to more accurate analysis, and of course more accurate analysis could lead to more legitimate, timely and confident decision making, as a result, better judgment and decisions more likely means higher operational efficiencies, reduced risk and cost reductions \hyperref[b1]{[2]}. Big Data researchers visualize big data as follows:\par
i. Volume-wise\par
This is one of the most important factors that contributed to the emergence of big data. Data volume is multiplying due to various factors. Organizations and governments have been recording transactional data for decades, social media is continuously pumping streams of unstructured data, and there are automation, sensor data, machine-to-machine data and so much more. Formerly, data storage was itself an issue, but thanks to advanced and affordable storage devices, today storage itself is not a big challenge; volume, however, still contributes to other challenges, such as determining the relevance within massive data volumes as well as collecting valuable information from data using analysis [3].\par
ii. Velocity-wise\par
Volume of data is a challenge, but the pace at which it is increasing is a serious challenge to be dealt with in terms of time and efficiency. The Internet streaming, RFID tags, automation and sensors, robotics and much more technology facilities are actually driving the need to deal with huge pieces of data in real time. So the velocity of data increase is one of the big data challenges standing in front of every big organization today \hyperref[b2]{[4]}.\par
iii. Variety-wise Rapidly growing huge volume of data is a big challenge but the variety of data is bigger challenge. Data is growing in variety of formats, structured, unstructured, relational and non-relational, different files systems, videos, images, multimedia, financial data, aviation data and scientific data etc. Now the challenge is to find means to correlate all variety of data timely to get value from this data. Today huge numbers of organizations are striving to get better solutions to this challenge [3]. 
\section[{iv. Variability-wise}]{iv. Variability-wise}\par
Rapidly growing data with increasing variety is what makes big data challenging but ups and downs in this trend of big data flow is also a big challenge, social media response to global events drives huge volumes of data and it is required to be analyzed on time before trend changes. Global events impact on financial markets, this overhead increase more while dealing with un-structured data \hyperref[b3]{[5]}. 
\section[{v. Complexity-wise}]{v. Complexity-wise}\par
All the above factors make big data a real challenge: huge volumes, continuously multiplying with increasing variety of sources, and with unpredicted trends. Despite all those facts, big data must be processed to connect, correlate and create meaningful relational hierarchies and linkages right on time, before this data goes out of control. This pretty much explains the complexity involved in big data today \hyperref[b3]{[5]}.\par
To precise, any big data repository with following characteristics can be termed big data. The following are brief introduction of some of selected big data analysis tools along with brief overview of Apache Spark and finally justification of apache spark with other competitors to distinguish and justify use of Apache Spark. 
\section[{i. Apache Hive}]{i. Apache Hive}\par
Hive is a data warehousing infrastructure, which runs on top of Hadoop. It provides a language called Hive QL to organize, aggregate and run queries on the data. Hive QL is similar to SQL, using a declarative programming model \hyperref[b5]{[7]}. This differentiates the language from Pig Latin, which uses a more procedural approach. In Hive QL as in SQL the desired final results are described in one big query. In contrast, using Pig Latin, the query is built up step by step as a sequence of assignment operations.\par
Apache Hive enables developers specially SQL developers to write queries in Hive Query Language HQL. HQL is similar to standard query language. HQL queries can be broken down by Hive to communicate to MapReduce jobs executed across a Hadoop Cluster.\par
ii. Apache Pig Pig is a tool or in fact a platform to analyze huge volumes of big data. Substantial parallelization of tasks is a very key feature of Pig programs, which enables them to handle massive data sets \hyperref[b5]{[7]}. While Pig and Hive are meant to perform similar tasks \hyperref[b6]{[8]}. The Pig is better suited for the data preparation phase of data processing, while Hive fits the data warehousing and presentation scenario better. The idea is that as data is incrementally collected, it is first cleaned up using the tools provided by Pig and then stored. From that point on Hive is used to run ad-hoc queries analyzing the data. During this work the incremental buildup of a data warehouse is not enabled and both data preparation and querying are performed using Pig. The feasibility of using Pig and Hive in conjunction remains to be tested.\par
iii. Apache Zebra Apache Zebra is a kind of storage layer for data access at high level abstraction and especially tabular view for data available in Hadoop and relief's users of pig coming up with their own data storage models and retrieval codes. Zebra is a sub-project of Pig which provides a layer of abstraction between Pig Latin and the Hadoop Distributed File System \hyperref[b7]{[9]}. Zebra allows a Pig programmer to save relations in a table-oriented fashion (as opposed to flat text files, which are, normally used) along with meta-data describing the schema of each relation. The tests can be run using J Unit or a similar Java testing framework \hyperref[b8]{[10]}. 
\section[{iv. Apache H Base}]{iv. Apache H Base}\par
Apache H Base is a data base engine built using Hadoop and modeled after Google's Big Table. It is optimized for real time data access from tables of millions of columns and billions of rows. Among other features, H Base offers support for interfacing with Pig and Hive. The Pig API features a storage function for loading data from an H Base data base, but during this work the data was read from and written to flat HDFS files, because the data amounts were too small to necessitate the use of H Base \hyperref[b9]{[11]}.  \hyperref[b9]{[11]}. Because Chu kwa is meant mostly for the narrow area of log data processing, not general data analysis, the tools it offers are not as diverse as Pig's and not as well suited for the tasks performed in this work. 
\section[{vi. Apache Storm}]{vi. Apache Storm}\par
A dependable tool to process unbound streams of data or information. Storm is an ongoing distributed system for computation and it is an open source tool, currently undergoing incubation assessment with Apache. Storm performs the computation on live streams of data in same way traditional Hadoop does for batch processing. Storm was originally aimed at processing twitter streams, and now available as open source and being utilized in many organizations as stream processing tool. Apache spark is quick and reliable, scalable, and makes sure to transform information. It is also not very complex to be deployed and utilized \hyperref[b0]{[1]}. 
\section[{vii. Apache Spark}]{vii. Apache Spark}\par
Apache Spark is a general purpose cluster computing engine which is very fast and reliable. This system provides Application programing interfaces in various programing languages such as Java, Python, Scala. Spark is a cluster computing system from Apache with incubator status, this tool is specialized at making data analysis faster, it is pretty fast at both running programs as well as writing data. Spark supports in-memory computing, that enables it to query data much faster compared to disk-based engines such as Hadoop, and also it offers a general execution model that can optimize arbitrary operator graph. Initially system was developed at UC Berkeley's as research project and very quickly acquired incubator status in Apache in June 2013 \hyperref[b7]{[9]}. Generally speaking, Spark is advance and highly capable upgrade to Hadoop aimed at enhancing Hadoop ability of cutting edge analysis. Spark engine functions quite advance and different than Hadoop. Spark engine is developed for in-memory processing as well a disk based processing. This inmemory processing capability makes it much faster than any traditional data processing engine. For example project sensors report, logistic regression runtimes in Spark 100 x faster than Hadoop Map Reduce. This system also provides large number of impressive high level tools such as machine learning tool M Lib, structured data processing, Spark SQL, graph processing took Graph X, stream processing engine called Spark Streaming, and Shark for fast interactive question device. As shown in   
\section[{d) When Not to Use Apache Spark}]{d) When Not to Use Apache Spark}\par
Apache Spark is fasted General purpose big data analytics engine and it is very suitable for any kind of big data analysis. Only following two scenarios, can hinder the suitability of Apache spark \hyperref[b11]{[13]}. 
% Journal running header picked up during extraction (not article text):
% Global Journal of Computer Science and Technology (C) Volume XV Issue I Version I, Year 2015
\begin{itemize}
\item Low Tolerance to Latency requirements: If big data analysis is required to be performed on data streams and latency is the most crucial point rather than anything else. In this case using Apache Storm may produce better results, but again reliability is to be kept in mind.
\item Shortage of Memory resources: Apache Spark is the fastest general purpose engine due to the fact that it maintains all its current operations inside memory. Hence it requires an excess amount of memory, so in the case when available memory is very limited, Apache Hadoop Map Reduce may help better, considering the huge performance gap.
\end{itemize}
III. 
\section[{Material and Methods}]{Material and Methods}\par
The nature of this paper is to cope with huge amount of data and process / analyze huge volume of data to extract some meaningful information from that data in real time.\par
The big data is modern day technology term that have changed the way world have looked at data and all of methods and principles towards data. The Data gather of big data is totally different than our traditional ways of data gathering and techniques. Coping with big data specially analyzing in real time has become almost impossible with traditional data warehousing techniques. This limitation have resulted a race of new innovations in data handling and analyzing field. Number of new technologies and tools have emerged and claiming to resolve big data analyzing challenges. So technically speaking, Twitter streaming API is used to access twitter's big data using Apache Spark. 
\section[{a) Research Instrument}]{a) Research Instrument}\par
\begin{itemize}
\item Twitter Stream API: The Streaming APIs provide push deliveries of Tweets and other events, for real-time or low-latency applications. Twitter API is a well known source of big data and is used worldwide in numerous applications for a number of objectives.\par
In fact there are some limitations in the free Twitter API that should be considered while analyzing the results.
\item Apache Spark: As an open source computing framework to analyze the big data. Though Apache Spark is claiming to be the fastest big data analyzing tool in the market, the trust level and validation of results will still be subject to comparison with some existing tools like Apache Storm, for example.
\end{itemize}
In this paper the data processing is happening using Twitter streaming API and Apache Spark as shown in Figure-3-1 below.  
\section[{Results}]{Results}\par
This section illustrates and analysis the data collected for the experiment purpose by Apache Spark using twitter streaming API. The amount of data processed for each scenario, processing time and results are given in tabular as well as graphical format. Following scenarios were executed for experiment purpose on live streams of twits on twitter. 1. Top ten words collected during a particular period of time. (10 minutes) 2. Top ten languages collected during a particular period of time. (10 minutes) 3. Number of times a particular "word" being used in twits, twitted in a particular period of time.\par
Scenario 1: Top ten words collected in last 10 minutes 
\section[{Statistics:}]{Statistics:}\par
\begin{itemize}
\item The total number of tweets analyzed during this time = 23865
\item The total number of unique words = 77548
\item The total number of words = 160989
\item Total time duration = 10 minutes (600 seconds).
\item See Table \hyperref[tab_0]{4-1} for top ten words in tabular form.
% NOTE: the remainder of the next item was lost during extraction.
\item See Figure 4
\item See Table \hyperref[tab_2]{4-3} for number of twits posted using word ``mtvstars'' in tabular form
\item See Figure 4-3 for number of twits posted using word ``mtvstars'' shown graphically in charts
\end{itemize}
\section[{Discussion \& Future Work}]{Discussion \& Future Work}\par
As not many organizations share their big data sources. So study was limited to twitter free feed API and all limitations of this API, such as amount of data per request and performance etc. and that directly impact the results presented. Also a common laptop was used to analyze tweets as compare to dedicated Server. As a result of this study, following Scenarios were considered and analyzed and their results were presented in previous section. 1. Top ten words twitted during last specific period of time. 2. Top ten languages used to twit during specific period of time. 3. A list of twitted items matching a given search keyword.\par
Considering the above mentioned limitations, Apache Spark was able to analyze streamed tweets with a very minor latency of a few seconds, which proves that, despite being a big general purpose, interactive and flexible big data processing engine, Spark is very competitive in terms of stream processing as well. During the process of analyzing big data using Spark, a couple of improvement areas were identified as being of utmost importance and should be pursued as future work. Firstly, like most open source tools, Apache Spark is not the easiest tool to work with, especially when deploying and configuring Apache Spark for custom requirements. A flexible, user friendly configuration and programming utility for Apache Spark would be a great addition for the Apache Spark developer community. Secondly, analyzed data representation is poor; there is a very strong need for a powerful data representation tool to provide powerful reporting and KPI generation directly from Spark results, and having this utility in multiple languages would be a great added value.
\begin{figure}[htbp]
\noindent\textbf{}\includegraphics[]{image-2.png}
\caption{\label{fig_0}I}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{}\includegraphics[]{image-3.png}
\caption{\label{fig_1}}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{}\includegraphics[]{image-4.png}
\caption{\label{fig_2}Global}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{}\includegraphics[]{image-5.png}
\caption{\label{fig_3}}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{21}\includegraphics[]{image-6.png}
\caption{\label{fig_4}Figure- 2 - 1 :}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{1}\includegraphics[]{image-7.png}
\caption{\label{fig_5}Figure-3- 1 :}\end{figure}
     \begin{figure}[htbp]
\noindent\textbf{4} \par 
\begin{longtable}{P{0.08550295857988166\textwidth}P{0.47278106508875734\textwidth}P{0.2917159763313609\textwidth}}
\tabcellsep \multicolumn{2}{l}{1 : Top ten words in last 10 minutes}\\
S. No.\tabcellsep Word\tabcellsep Frequency\\
1\tabcellsep Lady\tabcellsep 24005\\
2\tabcellsep Today\tabcellsep 20056\\
3\tabcellsep https\tabcellsep 26558\\
4\tabcellsep ?????\tabcellsep 2619\\
5\tabcellsep Love\tabcellsep 86288\\
6\tabcellsep ???\tabcellsep 29002\\
7\tabcellsep ??? ????? ???\tabcellsep 34406\\
8\tabcellsep 2014\tabcellsep 43101\\
9\tabcellsep Mtvstars\tabcellsep 99449\\
10\tabcellsep ???\tabcellsep 90619\end{longtable} \par
 
\caption{\label{tab_0}Table 4-1: Top ten words in last 10 minutes}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{4} \par 
\begin{longtable}{P{0.08163841807909604\textwidth}P{0.5570621468926553\textwidth}P{0.2112994350282486\textwidth}}
S. No.\tabcellsep Language\tabcellsep Frequency\\
1\tabcellsep Thai\tabcellsep 359\\
2\tabcellsep Korean\tabcellsep 426\\
3\tabcellsep French\tabcellsep 435\\
4\tabcellsep Turkish\tabcellsep 491\\
5\tabcellsep Indonesian\tabcellsep 621\\
6\tabcellsep Spanish\tabcellsep 1258\\
7\tabcellsep Arabic\tabcellsep 1560\\
8\tabcellsep Russian\tabcellsep 2109\\
9\tabcellsep Japanese\tabcellsep 6957\\
10\tabcellsep English\tabcellsep 8114\end{longtable} \par
 
\caption{\label{tab_1}Table 4-2: Top ten languages in the last 10 minutes}\end{figure}
 \begin{figure}[htbp]
\noindent\textbf{4} \par 
\begin{longtable}{P{0.12697095435684647\textwidth}P{0.12344398340248963\textwidth}P{0.15518672199170122\textwidth}P{0.14460580912863072\textwidth}P{0.15518672199170122\textwidth}P{0.14460580912863072\textwidth}}
Tweet frequency\tabcellsep Time duration in seconds\tabcellsep Tweet frequency\tabcellsep Time duration in seconds\tabcellsep Tweet frequency\tabcellsep Time duration in seconds\\
405\tabcellsep 15\tabcellsep 15051\tabcellsep 215\tabcellsep 29158\tabcellsep 415\\
100\tabcellsep 22\tabcellsep 15401\tabcellsep 221\tabcellsep 29589\tabcellsep 421\\
1444\tabcellsep 29\tabcellsep 15557\tabcellsep 227\tabcellsep 30017\tabcellsep 427\\
2031\tabcellsep 35\tabcellsep 16281\tabcellsep 233\tabcellsep 30374\tabcellsep 433\\
2876\tabcellsep 41\tabcellsep 16689\tabcellsep 240\tabcellsep 30939\tabcellsep 442\\
3570\tabcellsep 47\tabcellsep 17104\tabcellsep 246\tabcellsep 31601\tabcellsep 448\end{longtable} \par
 
\caption{\label{tab_2}Table 4 -}\end{figure}
 			\footnote{© 2015 Global Journals Inc. (US)} 		 		\backmatter  			  				\begin{bibitemlist}{1}
\bibitem[Marketwired ()]{b6}\label{b6} 	 		\textit{},  		 			Marketwired 		.  		 \url{http://www.marketwired.com/press-release/apache-spark-beats-the-world-record-for-fastest-processing-of-big-data-1956518.htm}  		2014.  	 
\bibitem[Donkin (2014)]{b7}\label{b7} 	 		\textit{},  		 			R B Donkin 		.  		 \url{http://people.apache.org/\textasciitilde rdonkin/hadooptalk/hadoop.html}  		May 2014.  	 
\bibitem[Hadoop et al. (2014)]{b8}\label{b8} 	 		\textit{Welcome to Apache Hadoop}.  		 \url{http://hadoop.apache.org}  		May 2014.  	 
\bibitem[MLlib (2014)]{b11}\label{b11} 	 		\textit{Apache Spark performance},  		 			Spark MLlib 		.  		 \url{https://spark.apache.org/mllib}  		October 2014.  	 
\bibitem[Big Data: What it is and why it matters ()]{b1}\label{b1} 	 		\textit{Big Data: What it is and why it matters},  		 \url{http://www.sas.com/en\textunderscore us/insights/big-data/what-is-big-data.html}  		2014.  	 
\bibitem[Goldberg ()]{b2}\label{b2} 	 		\textit{Cloud Security Alliance Lists 10 Big data security Challenges},  		 			Michael Goldberg 		.  		 \url{http://data-informed.com/cloud-security-alliance-lists-10-big-data-security-challenges/}  		2012.  	 
\bibitem[Cloudera et al. (2014)]{b0}\label{b0} 	 		\textit{Community effort driving standardization of Apache Spark through expanded role in Hadoop Project},  		 			Databricks Cloudera 		,  		 			IBM 		,  		 			MapR 		,  		 			Open Source Standards 		.  		 \url{http://finance.yahoo.com/news/communityeffortdrivingstandardizationapache162000526.html}  		July 1 2014.  	 
\bibitem[Basu (2014)]{b10}\label{b10} 	 		 			Abhi Basu 		.  		 \url{http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/big-data-realtime-healthcare-analytics-whitepaper.pdf}  	 	 		\textit{Real-Time Healthcare Analytics on Apache Hadoop using Spark and Shark},  				December 2014.  	 
\bibitem[Hoover ()]{b5}\label{b5} 	 		 			Mark Hoover 		.  		 \url{http://washingtontechnology.com/articles/2013/02/28/big-data-challenges.aspx}  		\textit{Do you know big data's top 9 challenges?},  				2013.  	 
\bibitem[Hurst ()]{b4}\label{b4} 	 		 			Steve Hurst 		.  		 \url{http://www.scmagazine.com/top-10-security-challenges-for-2013/article/281519/}  		\textit{Top 10 Security Challenges for 2013},  				2013.  	 
\bibitem[Securosis ()]{b3}\label{b3} 	 		 			Securosis 		.  		 \url{https://securosis.com/assets/library/reports/SecuringBigData\textunderscore FINAL.pdf}  		\textit{Securing Big Data: Security Recommendations for Hadoop and NoSQL Environments},  				2012.  	 
\bibitem[Stella ()]{b9}\label{b9} 	 		\textit{Spark for Data Science: A Case Study},  		 			Casey Stella 		.  		 \url{http://hortonworks.com/blog/spark-data-science-case-study/}  		2014.  	 
\end{bibitemlist}
 			 		 	 
\end{document}
